diff --git a/Makefile.arm b/Makefile.arm new file mode 100644 index 000000000..8502d5286 --- /dev/null +++ b/Makefile.arm @@ -0,0 +1,12 @@ + +ifeq ($(CORE), ARMV7) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif + +ifeq ($(CORE), ARMV6) +CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +endif + + diff --git a/Makefile.arm64 b/Makefile.arm64 new file mode 100644 index 000000000..a4f8bab6b --- /dev/null +++ b/Makefile.arm64 @@ -0,0 +1,7 @@ + +ifeq ($(CORE), ARMV8) +CCOMMON_OPT += -march=armv8-a +FCOMMON_OPT += -march=armv8-a +endif + + diff --git a/Makefile.system b/Makefile.system index 7da074a65..b63c7799d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -336,14 +336,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif @@ -373,6 +373,19 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), arm) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + +ifeq ($(ARCH), arm64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + + + + # # C Compiler dependent settings # @@ -833,6 +846,19 @@ ifeq ($(DEBUG), 1) COMMON_OPT += -g endif +ifndef COMMON_OPT +ifeq ($(ARCH), arm) +COMMON_OPT = -O3 +endif +endif + +ifndef COMMON_OPT +ifeq ($(ARCH), arm64) +COMMON_OPT = -O3 +endif +endif + + ifndef COMMON_OPT COMMON_OPT = -O2 endif @@ -958,6 +984,10 @@ export HAVE_SSE4_2 export HAVE_SSE4A export 
HAVE_SSE5 export HAVE_AVX +export HAVE_VFP +export HAVE_VFPV3 +export HAVE_VFPV4 +export HAVE_NEON export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/c_check b/c_check index d5fe59f75..0828a5bba 100644 --- a/c_check +++ b/c_check @@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $defined = 0; @@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/cblas_noconst.h b/cblas_noconst.h new file mode 100644 index 000000000..fd2e940c0 --- /dev/null +++ b/cblas_noconst.h @@ -0,0 +1,303 @@ +#ifndef CBLAS_H +#define CBLAS_H + +#include <stddef.h> +#include "common.h" + +#ifdef __cplusplus +extern "C" { + /* Assume C declarations for C++ */ +#endif /* __cplusplus */ + +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + +/* Get the parallelization type which is used by OpenBLAS */ +int openblas_get_parallel(void); +/* OpenBLAS is compiled for sequential use */ +#define OPENBLAS_SEQUENTIAL 0 +/* OpenBLAS is compiled using normal threading model */ +#define OPENBLAS_THREAD 1 +/* OpenBLAS is compiled using OpenMP threading model */ +#define OPENBLAS_OPENMP 2 + + +#define CBLAS_INDEX size_t + +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE 
{CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; + +float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); +float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); +double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); + +openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); + +float cblas_sasum (blasint n, float *x, blasint incx); +double cblas_dasum (blasint n, double *x, blasint incx); +float cblas_scasum(blasint n, float *x, blasint incx); +double cblas_dzasum(blasint n, double *x, blasint incx); + +float cblas_snrm2 (blasint N, float *X, blasint incX); +double cblas_dnrm2 (blasint N, double *X, blasint incX); +float cblas_scnrm2(blasint N, float *X, blasint incX); +double cblas_dznrm2(blasint N, double *X, blasint incX); + +CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint 
incx); +CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); 
+void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); + +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint 
N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, 
blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double 
*X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void 
cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, 
double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + 
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, 
blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void 
cblas_xerbla(blasint p, char *rout, char *form, ...); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif diff --git a/common.h b/common.h index 309f246e2..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,15 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + + #ifdef OS_LINUX #include "common_linux.h" #endif diff --git a/common_arm.h b/common_arm.h new file mode 100644 index 000000000..8c9752d9f --- /dev/null +++ b/common_arm.h @@ -0,0 +1,169 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ARM +#define COMMON_ARM + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ + + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); + +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0 ); + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define 
GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_arm64.h b/common_arm64.h new file mode 100644 index 000000000..2da0d894c --- /dev/null +++ b/common_arm64.h @@ -0,0 +1,169 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ARM64 +#define COMMON_ARM64 + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ +/* + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); +*/ +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0d ); + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define 
GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/cpuid.h b/cpuid.h index 2cbbd4539..cb4404cb0 100644 --- a/cpuid.h +++ b/cpuid.h @@ -107,7 +107,7 @@ #define CORE_BOBCAT 21 #define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL CORE_SANDYBRIDGE +#define CORE_HASWELL 24 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,7 +200,6 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -// this define is because BLAS doesn't have haswell specific optimizations yet -#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE +#define CPUTYPE_HASWELL 48 #endif diff --git a/cpuid_arm.c b/cpuid_arm.c new file mode 100644 index 000000000..efd1369b4 --- /dev/null +++ b/cpuid_arm.c @@ -0,0 +1,262 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_ARMV6 1 +#define CPU_ARMV7 2 +#define CPU_CORTEXA15 3 + +static char *cpuname[] = { + "UNKOWN", + "ARMV6", + "ARMV7", + "CORTEXA15" +}; + + +int get_feature(char *search) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + + +int detect(void) +{ + +#ifdef linux + + FILE *infile; + char buffer[512], *p; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("model name", buffer, 10)) + { + p = strchr(buffer, ':') + 
2; + break; + } + } + + fclose(infile); + + if(p != NULL) + { + + if (strstr(p, "ARMv7")) + { + if ( get_feature("vfpv4")) + return CPU_ARMV7; + + if ( get_feature("vfpv3")) + return CPU_ARMV7; + + if ( get_feature("vfp")) + return CPU_ARMV6; + + + } + + if (strstr(p, "ARMv6")) + { + if ( get_feature("vfp")) + return CPU_ARMV6; + } + + + } +#endif + + return CPU_UNKNOWN; +} + +char *get_corename(void) +{ + return cpuname[detect()]; +} + +void get_architecture(void) +{ + printf("ARM"); +} + +void get_subarchitecture(void) +{ + int d = detect(); + switch (d) + { + + case CPU_ARMV7: + printf("ARMV7"); + break; + + case CPU_ARMV6: + printf("ARMV6"); + break; + + default: + printf("UNKNOWN"); + break; + } +} + +void get_subdirname(void) +{ + printf("arm"); +} + +void get_cpuconfig(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV7: + printf("#define ARMV7\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + if ( get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + case CPU_ARMV6: + printf("#define ARMV6\n"); + printf("#define HAVE_VFP\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + } +} + + +void get_libname(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV7: + printf("armv7\n"); + break; + + case CPU_ARMV6: + printf("armv6\n"); + break; + + } +} + + +void get_features(void) +{ + +#ifdef linux + FILE *infile; + char 
buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; } + if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; } + if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; } + if (!strcmp(t, "neon")) { printf("HAVE_NEON=1\n"); continue; } + } + +#endif + return; +} + + diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e850a2aa..4ed01b891 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1243,6 +1243,7 @@ static char *cpuname[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *lowercpuname[] = { @@ -1293,6 +1294,7 @@ static char *lowercpuname[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; static char *corename[] = { @@ -1320,6 +1322,7 @@ static char *corename[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *corename_lower[] = { @@ -1347,6 +1350,7 @@ static char *corename_lower[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; diff --git a/ctest.c b/ctest.c index 413519274..86dc226d4 100644 --- a/ctest.c +++ b/ctest.c @@ -124,3 +124,12 @@ ARCH_IA64 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) BINARY_64 #endif + +#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) +ARCH_ARM +#endif + +#if defined(__aarch64__) +ARCH_ARM64 +#endif + diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && 
defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 02f006d39..d6a08eb9f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_HASWELL; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #endif -//Use sandy bridge kernels for haswell. 
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE + #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -297,6 +298,7 @@ static char *corename[] = { "Bobcat", "Bulldozer", "Piledriver", + "Haswell", }; char *gotoblas_corename(void) { @@ -319,7 +321,8 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; return corename[0]; } diff --git a/getarch.c b/getarch.c index 3ffda6244..6e1938434 100644 --- a/getarch.c +++ b/getarch.c @@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "SANDYBRIDGE" #endif +#ifdef FORCE_HASWELL +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -679,6 +694,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "generic" #endif +#ifdef FORCE_ARMV7 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV7" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV7 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP" +#define LIBNAME "armv7" +#define CORENAME "ARMV7" +#else +#endif + +#ifdef FORCE_ARMV6 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV6" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV6 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP" +#define LIBNAME "armv6" +#define CORENAME "ARMV6" +#else +#endif + +#ifdef FORCE_ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" +#define LIBNAME "armv8" +#define CORENAME "ARMV8" +#else +#endif + + #ifndef FORCE #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ @@ -719,6 +780,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __arm__ +#include "cpuid_arm.c" +#define OPENBLAS_SUPPORTED +#endif + + #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
#endif @@ -773,7 +840,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -788,6 +855,11 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); +#if defined(__arm__) && !defined(FORCE) + get_features(); +#endif + + #if defined(__i386__) || defined(__x86_64__) #ifndef FORCE get_sse(); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f8152ac50..b9b4bef1e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(ARCH), arm) +USE_TRMM = 1 +endif + +ifeq ($(ARCH), arm64) +USE_TRMM = 1 +endif + +ifeq ($(TARGET), LOONGSON3B) +USE_TRMM = 1 +endif + + + SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ -ifeq ($(TARGET), LOONGSON3B) + +ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -582,24 +597,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - -ifdef STRMMKERNEL - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - - else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -613,93 +610,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< 
-o $@ -else -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -else - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -endif - -ifdef QTRMMKERNEL - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE 
-UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -else $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -713,36 +634,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef CTRMMKERNEL - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - 
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -767,37 +658,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -ifdef ZTRMMKERNEL - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - - -else - $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -821,37 +681,10 @@ 
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - endif -endif - -ifdef XTRMMKERNEL - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -877,9 +710,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT 
-DTRANSA -DCONJ -DNC $< -o $@ -endif - - $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 new file mode 100644 index 000000000..f47a843f3 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV6 @@ -0,0 +1,142 @@ +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S + +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S + +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S + +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S + +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S + +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S + +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S + +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S + +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + 
+SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S + +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S + +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S + +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S +ZGEMMONCOPY = 
zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 new file mode 100644 index 000000000..507f9813c --- /dev/null +++ b/kernel/arm/KERNEL.ARMV7 @@ -0,0 +1,141 @@ +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S + +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S + +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S + +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S + +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S + +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S + +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S + +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S + +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = 
axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S + +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S + +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/kernel/arm/Makefile b/kernel/arm/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c new file mode 100644 index 000000000..55107ca4f --- /dev/null +++ b/kernel/arm/amax.c @@ -0,0 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n < 0 || inc_x < 1 ) return(maxf); + + maxf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) > ABS(maxf) ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c new file mode 100644 index 000000000..3f7e97be6 --- /dev/null +++ b/kernel/arm/amin.c @@ -0,0 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n < 0 || inc_x < 1 ) return(minf); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c new file mode 100644 index 000000000..5ac6936a0 --- /dev/null +++ b/kernel/arm/asum.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n < 0 || inc_x < 1 ) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S new file mode 100644 index 000000000..2b6ceb191 --- /dev/null +++ b/kernel/arm/asum_vfp.S @@ -0,0 +1,481 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, 
X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + 
vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 +#else + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: 
+ + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c new file mode 100644 index 000000000..dceddf78a --- /dev/null +++ b/kernel/arm/axpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S new file mode 100644 index 000000000..acc575707 --- /dev/null +++ b/kernel/arm/axpy_vfp.S @@ -0,0 +1,503 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + +#if !defined(CONJ) + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fnmacd +#define FMAC_I1 fmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fnmacs +#define FMAC_I1 fmacs +#define FMAC_I2 fmacs + +#endif + +#else // CONJ + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fmacd +#define FMAC_I1 fnmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fmacs +#define FMAC_I1 fnmacs +#define FMAC_I2 fmacs + +#endif + +#endif + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + fmacd d9 , d0, d5 + fstmiad Y!, { d9 } + fmacd d10, d0, d6 + fstmiad Y!, { d10 } + fmacd d11, d0, d7 + fstmiad Y!, { d11 } + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y , { d8 } + add X, X, INC_X + add Y, 
Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + fmacs s9 , s0, s5 + fstmias Y!, { s9 } + fmacs s10, s0, s6 + fstmias Y!, { s10 } + fmacs s11, s0, s7 + fstmias Y!, { s11 } + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y , { s8 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y , { d8 - d9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s7 } + pld [ Y, #X_PRE ] + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, 
s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y , { s8 - s9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + sub sp, sp, #STACKSIZE // reserve stack + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + sub r12, fp, #128 + +#if defined(DOUBLE) + vstm r12, { d8 - d15} // store floating point registers +#else + vstm r12, { s8 - s15} // store floating point registers +#endif + + cmp N, #0 + ble axpy_kernel_L999 + + cmp INC_X, #0 + beq axpy_kernel_L999 + + cmp INC_Y, #0 + beq axpy_kernel_L999 + + cmp INC_X, #1 + bne axpy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne axpy_kernel_S_BEGIN + + +axpy_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_F1 + + .align 5 + +axpy_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + 
subs I, I, #1 + ble axpy_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne axpy_kernel_F4 + +axpy_kernel_F1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne axpy_kernel_F10 + + b axpy_kernel_L999 + +axpy_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_S1 + + .align 5 + +axpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S4 + +axpy_kernel_S1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S10 + + +axpy_kernel_L999: + + sub r3, fp, #128 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S new file mode 100644 index 000000000..aaba7825e --- /dev/null +++ b/kernel/arm/ccopy_vfp.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s7 } + fstmias Y!, { s0 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 - s1 } + fstmias Y!, { s0 - s1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ 
+ + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble ccopy_kernel_L999 + + cmp INC_X, #0 + beq ccopy_kernel_L999 + + cmp INC_Y, #0 + beq ccopy_kernel_L999 + + cmp INC_X, #1 + bne ccopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ccopy_kernel_S_BEGIN + +ccopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_F1 + +ccopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne ccopy_kernel_F4 + +ccopy_kernel_F1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne ccopy_kernel_F10 + + b ccopy_kernel_L999 + +ccopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_S1 + +ccopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne ccopy_kernel_S4 + +ccopy_kernel_S1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne ccopy_kernel_S10 + + + + + + +ccopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S new file mode 100644 index 000000000..b653888df --- /dev/null +++ b/kernel/arm/cdot_vfp.S @@ -0,0 +1,284 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 
, s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + + cmp N, #0 + ble cdot_kernel_L999 + + cmp INC_X, #0 + beq cdot_kernel_L999 + + cmp INC_Y, #0 + beq cdot_kernel_L999 + + cmp INC_X, #1 + bne cdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne cdot_kernel_S_BEGIN + +cdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble cdot_kernel_F1 + +cdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne cdot_kernel_F4 + +cdot_kernel_F1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne cdot_kernel_F10 + + b cdot_kernel_L999 + +cdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble 
cdot_kernel_S1 + +cdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne cdot_kernel_S4 + +cdot_kernel_S1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne cdot_kernel_S10 + + + +cdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if !defined(CONJ) + vsub.f32 s0 , s0, s2 + vadd.f32 s1 , s1, s3 +#else + vadd.f32 s0 , s0, s2 + vsub.f32 s1 , s1, s3 +#endif + + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..75fbf097b --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -0,0 +1,1252 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R 
fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , 
s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + fldmias CO2, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 
+ fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + fldmias CO2, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO 
] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R 
s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + 
KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + 
+/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + +cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I 
= I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..3aba68de8 --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1309 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/01 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* CGEMM_P 96 +* CGEMM_Q 120 +* CGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS +* 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS +* 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define 
ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - 
s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + 
fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + fldmias CO2, { s8 - s11 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs 
s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + fldmias CO2, { s8 - s9 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + 
+/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro 
KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + 
add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + 
KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + 
SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + +cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, 
#1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..08fbd5501 --- /dev/null +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s5 , [ AO1, #12 ] + + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s6 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 4 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +cgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble cgemm_ncopy_L1_BEGIN + +cgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L2_M2_40 + +cgemm_ncopy_L2_M2_20: + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + + COPY2x2 + subs I , I , #1 + ble cgemm_ncopy_L2_M2_40 + + COPY2x2 + subs I , I , #1 + bne cgemm_ncopy_L2_M2_20 + + +cgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L2_M2_END + 
+cgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne cgemm_ncopy_L2_M2_60 + + +cgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +cgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble cgemm_ncopy_L999 + + +cgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L1_M2_40 + +cgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_20 + + +cgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L1_M2_END + +cgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_60 + + +cgemm_ncopy_L1_M2_END: + + + +cgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S new file mode 100644 index 000000000..9036b994d --- /dev/null +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, 
#16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 -s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #4 // M4 = M * 2 * SIZE * 2 + +cgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = N / 2 + ble cgemm_tcopy_L1_BEGIN + +cgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble cgemm_tcopy_L2_M2_60 + +cgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne cgemm_tcopy_L2_M2_40 + +cgemm_tcopy_L2_M2_60: + + tst N , #1 + ble cgemm_tcopy_L2_M2_END + + COPY1x2 + + +cgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +cgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble cgemm_tcopy_L999 + + 
+cgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble cgemm_tcopy_L1_M2_60 + + +cgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + bne cgemm_tcopy_L1_M2_40 + +cgemm_tcopy_L1_M2_60: + + tst N , #1 + ble cgemm_tcopy_L1_M2_END + + COPY1x1 + + +cgemm_tcopy_L1_M2_END: + + + +cgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S new file mode 100644 index 000000000..522c4c764 --- /dev/null +++ b/kernel/arm/cgemv_n_vfp.S @@ -0,0 +1,697 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + 
#define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro KERNEL_F4X1 + + pld [ AO2, #A_PRE ] + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_F4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO!, { s4 - s7 } + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_F1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + 
flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, #8 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { 
s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_S1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble cgemvn_kernel_L999 + + cmp N, #0 + ble cgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr s0 , ALPHA_R + vstr s1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvn_kernel_L999 + + cmp INC_Y, #0 + beq cgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne cgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne cgemvn_kernel_S4_BEGIN + + +cgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_F1_BEGIN + +cgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 
+ str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_F4X1 + + +cgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne cgemvn_kernel_F4X4_10 + + +cgemvn_kernel_F4X1: + + ands J, N , #3 + ble cgemvn_kernel_F4_END + +cgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne cgemvn_kernel_F4X1_10 + + +cgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne cgemvn_kernel_F4X4 + + +cgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +cgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne cgemvn_kernel_F1X1_10 + + +cgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne cgemvn_kernel_F1X1 + + b cgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_S1_BEGIN + +cgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_S4X1 + + +cgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne cgemvn_kernel_S4X4_10 + + +cgemvn_kernel_S4X1: + + ands J, N , #3 + ble cgemvn_kernel_S4_END + +cgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne cgemvn_kernel_S4X1_10 + + +cgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne cgemvn_kernel_S4X4 + + +cgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_S1 + + mov 
J, N + + +cgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne cgemvn_kernel_S1X1_10 + + +cgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne cgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +cgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S new file mode 100644 index 000000000..52276a06f --- /dev/null +++ b/kernel/arm/cgemv_t_vfp.S @@ -0,0 +1,607 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I 
fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +.macro INIT_F2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + fldmias XO! , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO!, { s4 - s5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add 
YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble cgemvt_kernel_L999 + + cmp OLD_N, #0 + ble cgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvt_kernel_L999 + + cmp INC_Y, #0 + beq cgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE +#else + lsl LDA, LDA, #3 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne cgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne cgemvt_kernel_S2_BEGIN + + +cgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_F1_BEGIN + +cgemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F2X1 + + +cgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne cgemvt_kernel_F2X4_10 + + +cgemvt_kernel_F2X1: + + ands I, M , #3 + ble cgemvt_kernel_F2_END + +cgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne cgemvt_kernel_F2X1_10 + + +cgemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne cgemvt_kernel_F2X4 + + +cgemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F1X1 + + +cgemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne cgemvt_kernel_F1X4_10 + + +cgemvt_kernel_F1X1: + + ands I, M , #3 + ble cgemvt_kernel_F1_END + 
+cgemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne cgemvt_kernel_F1X1_10 + + +cgemvt_kernel_F1_END: + + SAVE_F1 + + b cgemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_S1_BEGIN + +cgemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S2X1 + + +cgemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne cgemvt_kernel_S2X4_10 + + +cgemvt_kernel_S2X1: + + ands I, M , #3 + ble cgemvt_kernel_S2_END + +cgemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne cgemvt_kernel_S2X1_10 + + +cgemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne cgemvt_kernel_S2X4 + + +cgemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S1X1 + + +cgemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne cgemvt_kernel_S1X4_10 + + +cgemvt_kernel_S1X1: + + ands I, M , #3 + ble cgemvt_kernel_S1_END + +cgemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne cgemvt_kernel_S1X1_10 + + +cgemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff 
--git a/kernel/arm/copy.c b/kernel/arm/copy.c new file mode 100644 index 000000000..f742a4a33 --- /dev/null +++ b/kernel/arm/copy.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..a68434f97 --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -0,0 +1,1455 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs 
s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 
+ fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 
, [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + 
flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , 
#4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + 
+_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + 
str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne 
_L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..28e555caa --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1476 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fnmacs + 
+#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + 
fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE 
] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 
+ fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + 
flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, 
ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + 
SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + 
KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values 
in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S new file mode 100644 index 000000000..0fad3c4a6 --- /dev/null +++ b/kernel/arm/dcopy_vfp.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d0 - d3 } + fstmiad Y!, { d0 - d3 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 } + fstmiad Y!, { d0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 } 
+ fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d1 } + fstmiad Y, { d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 } + fstmiad Y, { d0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble dcopy_kernel_L999 + + cmp INC_X, #0 + beq dcopy_kernel_L999 + + cmp INC_Y, #0 + beq dcopy_kernel_L999 + + cmp INC_X, #1 + bne dcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne dcopy_kernel_S_BEGIN + +dcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_F1 + +dcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne dcopy_kernel_F4 + +dcopy_kernel_F1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne dcopy_kernel_F10 + + b dcopy_kernel_L999 + +dcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble dcopy_kernel_S1 + +dcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne dcopy_kernel_S4 + +dcopy_kernel_S1: + + ands I, N, #3 + ble dcopy_kernel_L999 + +dcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne dcopy_kernel_S10 + + + + + + +dcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S new file mode 100644 index 000000000..ab819ec98 --- /dev/null +++ b/kernel/arm/ddot_vfp.S @@ -0,0 +1,248 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d8 } + pld [ Y, #X_PRE ] + fldmiad Y!, { d4 } + fldmiad Y!, { d5 } + fmacd d0 , d4, d8 + fldmiad X!, { d9 } + fldmiad Y!, { d6 } + fmacd d1 , d5, d9 + fldmiad X!, { d10 } + fldmiad X!, { d11 } + fmacd d0 , d6, d10 + fldmiad Y!, { d7 } + fmacd d1 , d7, d11 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y!, { d8 } + fmacd d0 , d4, d8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + fldmiad X, { d4 } + fldmiad Y, { d8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d0 , d4, d8 + + fldmiad X, { d5 } + fldmiad Y, { d9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d1 , d5, d9 + + fldmiad X, { d6 } + fldmiad Y, { d10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d0 , d6, d10 + + fldmiad X, { d7 } + fldmiad Y, { d11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacd d1 , d7, d11 + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + 
fldmiad Y, { d8 } + add X, X, INC_X + fmacd d0 , d4, d8 + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + + cmp N, #0 + ble ddot_kernel_L999 + + cmp INC_X, #0 + beq ddot_kernel_L999 + + cmp INC_Y, #0 + beq ddot_kernel_L999 + + cmp INC_X, #1 + bne ddot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ddot_kernel_S_BEGIN + +ddot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ddot_kernel_F1 + +ddot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + ble ddot_kernel_F1 + + + KERNEL_F4 + + subs I, I, #1 + bne ddot_kernel_F4 + +ddot_kernel_F1: + + ands I, N, #3 + ble ddot_kernel_L999 + +ddot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne ddot_kernel_F10 + + b ddot_kernel_L999 + +ddot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble ddot_kernel_S1 + +ddot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne ddot_kernel_S4 + +ddot_kernel_S1: + + ands I, N, #3 + ble ddot_kernel_L999 + +ddot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne ddot_kernel_S10 + + + + + + +ddot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + vadd.f64 d0 , d0, d1 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..55409a5ef --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -0,0 +1,806 @@ +/*************************************************************************** 
+Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/27 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + pld [ AO, #A_PRE ] + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] + fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] + fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + add AO , AO, #32 + fmacd d14 , d2, d5 + add BO , BO, #16 + fmacd d15 , d3, d5 + + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + 
fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + pld [ CO1, #C_PRE ] + fmacd d4 , d0 , d8 + fldd d6 , [CO1, #16 ] + fmacd d5 , d0 , d9 + fldd d7 , [CO1, #24 ] + fmacd d6 , d0 , d10 + fstd d4 , [CO1] + fmacd d7 , d0 , d11 + + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + pld [ CO2, #C_PRE ] + fmacd d4 , d0 , d12 + fldd d6 , [CO2, #16 ] + fmacd d5 , d0 , d13 + fldd d7 , [CO2, #24 ] + fmacd d6 , d0 , d14 + fstd d4 , [CO2] + fmacd d7 , d0 , d15 + add CO1, CO1, #32 + + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + fldd d4 , [CO2] + + fmacd d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + 
+/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + 
add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble dgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble 
dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + 
+dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..3b6af19a3 --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1483 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/11 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 96 +* DGEMM_R 512 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.57 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.14 GFLOPS ATLAS: 3.16 GFLOPS +* 3 Cores: 4.56 GFLOPS ATLAS: 4.60 GFLOPS +* 4 Cores: 5.82 GFLOPS ATLAS: 5.41 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + 
vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + fldd d8 , [ BO ] + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + + fldd d1 , [ AO, #8 ] + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmuld d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmuld d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmuld d21 , d1, d9 + add BO , BO, #32 + add AO , AO, #32 + fmuld d22 , d2, d9 + + pld [ BO , #B_PRE ] + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] + fmuld d24 , d0, d10 + fldd d5 , [ AO, #8 ] + fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] + fmuld d26 , d2, d10 + fldd d7 , [ AO, #24 ] + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + +.endm + + + +.macro KERNEL4x4_M2 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE+32 ] + fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] + fmacd d18 , d6, d12 + pld [ BO , #B_PRE+32 ] + fmacd d19 , d7, d12 + + fldd d8 , [ BO , #32 ] + fmacd d20 , d4, d13 + fldd d1 , [ AO, #40 ] + fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] + fmacd d22 , d6, d13 + fldd d3 , [ AO, #56 ] + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fldd d9 , [ BO, #40 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #48 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #56 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + add AO , AO, #64 + fmacd d30 , d6, d15 + add BO , BO, #64 + fmacd d31 , d7, d15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + fldd d4 , [ AO ] + fmacd d18 , d2, d8 + pld [ BO , #B_PRE ] + fmacd d19 , d3, d8 + + fldd d12, [ BO ] + fmacd d20 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] + fmacd d22 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] 
+ fmacd d26 , d2, d10 + fldd d14, [ BO, #16 ] + fmacd d27 , d3, d10 + + fldd d15, [ BO, #24 ] + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + +.endm + + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + fmacd d17 , d5, d12 + add BO , BO, #32 + add AO , AO, #32 + fmacd d18 , d6, d12 + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + + fldd d8 , [ BO ] + pld [ BO , #B_PRE ] + + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmacd d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmacd d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmiad CO1, { d8 - d11 } + pld [ r4 , #C_PRE ] + + fmacd d8 , d0 , d16 + fldd d12, [CO2] + fmacd d9 , d0 , d17 + fldd d13, [CO2, #8 ] + fmacd d10, d0 , d18 + fldd d14, [CO2, #16 ] + fmacd d11, d0 , d19 + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fstd d8 , [CO1] + fmacd d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmacd d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmacd d15, d0 , d23 + fstd d11, [CO1, #24 ] + + fldmiad r4, { d8 - d11 } + + fmacd d8 , d0 , d24 + fstd d12, [CO2] + fmacd d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmacd d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmacd 
d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmiad CO2, { d12 - d15 } + + fstd d8 , [r4 ] + fmacd d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmacd d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmacd d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmacd d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fldd d8 , [r4 ] + fldd d9 , [r4 , #8 ] + + fmacd d8 , d0 , d24 + fmacd d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d28 + fmacd d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + 
fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + fldd d8 , [r4 ] + fmacd d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fmacd d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + fldd d14, [CO2, #16 ] + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + fmacd d14, d0 , d22 + fmacd d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + 
+/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fldd d8 , 
[CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble dgemm_kernel_L2_BEGIN + +dgemm_kernel_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 
4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +dgemm_kernel_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #2 + blt dgemm_kernel_L4_M4_32 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_32: + + tst L, #1 + ble dgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + + +dgemm_kernel_L4_M4_40: + + INIT4x4 + + +dgemm_kernel_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne dgemm_kernel_L4_M4_46 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + + subs I, I, #1 + bne dgemm_kernel_L4_M4_20 + + +dgemm_kernel_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L4_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M2_100 + 
+dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + + +dgemm_kernel_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr J , N + tst J , #3 + ble dgemm_kernel_L999 + + tst J , #2 + ble dgemm_kernel_L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + 
subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble 
dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set 
return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..763c032e1 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #3 // lda = lda * 8 + + ldr BO, B + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A 
+ add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L2_M2_40 + +dgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_20 + + +dgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L2_M2_END + +dgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_60 + + +dgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L1_M2_40 + +dgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_20 + + +dgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L1_M2_END + +dgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_60 + + +dgemm_ncopy_L1_M2_END: + + + +dgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S new file mode 100644 index 000000000..ad6692e50 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + fldd d2 , [ AO3, #0 ] + fldd d3 , [ AO4, #0 ] + + fldd d4 , [ AO1, #8 ] + fldd d8 , [ AO1, #16 ] + fldd d12, [ AO1, #24 ] + + fldd d5 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d9 , [ AO2, #16 ] + fldd d13, [ AO2, #24 ] + + fldd d6 , [ AO3, #8 ] + add AO2, AO2, #32 + fldd d10, [ AO3, #16 ] + fldd d14, [ AO3, #24 ] + + fldd d7 , [ AO4, #8 ] + add AO3, AO3, #32 + fldd d11, [ AO4, #16 ] + fldd d15, [ AO4, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #32 + fstmiad BO!, { d4 - d7 } + fstmiad BO!, { d8 - d15 } + +.endm + +.macro COPY1x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + fldd d2 , [ AO3, #0 ] + add AO2, AO2, #8 + fldd d3 , [ AO4, #0 ] + + add AO3, AO3, #8 + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #8 + +.endm + +.macro COPY4x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d6 , [ AO1, #24 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d5 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY4x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, 
#24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 8 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +dgemm_ncopy_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble dgemm_ncopy_L2_BEGIN + +dgemm_ncopy_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L4_M4_40 + +dgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_ncopy_L4_M4_20 + + +dgemm_ncopy_L4_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L4_M4_END + +dgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne dgemm_ncopy_L4_M4_60 + + +dgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble dgemm_ncopy_L999 + + tst N, #2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L2_M4_40 + +dgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M4_20 + + +dgemm_ncopy_L2_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L2_M4_END + +dgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M4_60 + + +dgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L1_M4_40 + +dgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M4_20 + + +dgemm_ncopy_L1_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L1_M4_END + +dgemm_ncopy_L1_M4_60: + + 
COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M4_60 + + +dgemm_ncopy_L1_M4_END: + + + +dgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S new file mode 100644 index 000000000..88a139ad8 --- /dev/null +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -0,0 +1,408 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d8 - d11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d12 - d15 } + + fstmiad BO1, { d0 - d15 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro 
COPY2x4 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + add r3, r3, LDA + fldmiad r3, { d4 - d5 } + + add r3, r3, LDA + fldmiad r3, { d6 - d7 } + + fstmiad BO2, { d0 - d7 } + add AO1, AO1, #16 + add BO2, BO2, #64 + +.endm + +.macro COPY1x4 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + add r3, r3, LDA + fldmiad r3, { d2 } + + add r3, r3, LDA + fldmiad r3, { d3 } + + fstmiad BO3, { d0 - d3 } + add AO1, AO1, #8 + add BO3, BO3, #32 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + fstmiad BO3, { d0 - d1 } + add AO1, AO1, #8 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 } + + fstmiad BO3, { d0 } + add AO1, AO1, #8 + add BO3, BO3, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve 
stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 + add BO3 , BO3, r3 + + lsl M4, M, #5 // M4 = M * 4 * SIZE + +dgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = M / 4 + ble dgemm_tcopy_L2_BEGIN + +dgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #128 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L4_M4_40 + +dgemm_tcopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_tcopy_L4_M4_20 + + +dgemm_tcopy_L4_M4_40: + + tst N , #2 + ble dgemm_tcopy_L4_M4_60 + + COPY2x4 + + +dgemm_tcopy_L4_M4_60: + + tst N, #1 + ble dgemm_tcopy_L4_M4_END + + COPY1x4 + + +dgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble dgemm_tcopy_L999 + + tst M, #2 + ble dgemm_tcopy_L1_BEGIN + +dgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L2_M4_40 + +dgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_tcopy_L2_M4_20 + + +dgemm_tcopy_L2_M4_40: + + tst N , #2 + ble dgemm_tcopy_L2_M4_60 + + COPY2x2 + +dgemm_tcopy_L2_M4_60: + + tst N , #1 + ble dgemm_tcopy_L2_M4_END + + COPY1x2 + + +dgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble dgemm_tcopy_L999 + + +dgemm_tcopy_L1_M4_BEGIN: + 
+ ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L1_M4_40 + +dgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_tcopy_L1_M4_20 + + +dgemm_tcopy_L1_M4_40: + + tst N , #2 + ble dgemm_tcopy_L1_M4_60 + + COPY2x1 + +dgemm_tcopy_L1_M4_60: + + tst N , #1 + ble dgemm_tcopy_L1_M4_END + + COPY1x1 + + +dgemm_tcopy_L1_M4_END: + + + +dgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c new file mode 100644 index 000000000..30490e291 --- /dev/null +++ b/kernel/arm/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..762b9c580 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -0,0 +1,1089 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d0 , [ AO ] + + fldd d1 , [ AO, #8 ] + pld [ AO , #A_PRE ] + + fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] + fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] + fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + add AO , AO, #32 + fmacd d14 , d2, d5 + add BO , BO, #16 + 
fmacd d15 , d3, d5 + + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + fmuld d6 , d0 , d14 + fmuld d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + + fmuld d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO 
] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif 
+ str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + 
+_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , 
L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + 
sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 
// L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..0f8a9291a --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1953 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + 
fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, 
d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fmuld d12, d0 , d20 + fstd d8 , [CO1] + fmuld d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmuld d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmuld d15, d0 , d23 + fstd d11, [CO1, #24 ] + + + fmuld d8 , d0 , d24 + fstd d12, 
[CO2] + fmuld d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmuld d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmuld d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + fstd d8 , [r4 ] + fmuld d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmuld d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmuld d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmuld d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fmuld d8 , d0 , d24 + fmuld d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fmuld d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , 
d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + fmuld d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + fmuld d14, d0 , d22 + fmuld d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO 
, AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, 
#8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #5 // L = L / 8 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + ands L , K1, #31 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_42: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_42 + +_L4_M4_100: + + SAVE4x4 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L4_M4_END: + + subs I, I, #1 + 
bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + 
sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double 
values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + 
KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , 
r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 
// 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 
double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c new file mode 100644 index 000000000..aedcca965 --- /dev/null +++ b/kernel/arm/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n < 0 || inc_x < 1 ) return(max); + + maxf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) > ABS(maxf) ) + { + max = i; + maxf = ABS(x[ix]); + 
} + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S new file mode 100644 index 000000000..1d7344898 --- /dev/null +++ b/kernel/arm/iamax_vfp.S @@ -0,0 +1,478 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define INDEX r3 +#define Z r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm 
+ +#else + +.macro INIT_F + + fldmias X!, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + fldmias X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + 
vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4} + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 +#else + vsub.f32 s0 , s0 , s0 +#endif + mov INDEX, #0 + + cmp N, #0 + ble iamax_kernel_L999 + + cmp INC_X, #0 + beq iamax_kernel_L999 + + + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + + +iamax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_F1 + + .align 5 + +iamax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble iamax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F4 + +iamax_kernel_F1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_S1 + + .align 5 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + 
bne iamax_kernel_S10 + + +iamax_kernel_L999: + + mov r0, INDEX // set return value + + pop {r4} + bx lr + + EPILOGUE + diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c new file mode 100644 index 000000000..fdb5d7a10 --- /dev/null +++ b/kernel/arm/iamin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c new file mode 100644 index 000000000..e3e4b9a6c --- /dev/null +++ b/kernel/arm/imax.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x < 1 ) return(max); // n == 0 must also return 0: otherwise x[0] is read out of bounds below + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c new file mode 100644 index 000000000..fbcadc2fd --- /dev/null +++ b/kernel/arm/imin.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x < 1 ) return(min); // n == 0 must also return 0: otherwise x[0] is read out of bounds below + + minf=x[0]; + + while(i < n) + { + if( x[ix] < minf ) // '<' finds the minimum; the original '>' returned the index of the maximum + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c new file mode 100644 index 000000000..a6ba86388 --- /dev/null +++ b/kernel/arm/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf[2]; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(max); + + inc_x2 = 2 * inc_x; + + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) > CABS1(maxf,0) ) + { + max = i; + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c new file mode 100644 index 000000000..45c2a7c9c --- /dev/null +++ b/kernel/arm/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf[2]; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(min); + + inc_x2 = 2 * inc_x; + + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) < CABS1(minf,0) ) + { + min = i; + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/max.c b/kernel/arm/max.c new file mode 100644 index 000000000..3239e3408 --- /dev/null +++ b/kernel/arm/max.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n < 0 || inc_x < 1 ) return(maxf); + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/arm/min.c b/kernel/arm/min.c new file mode 100644 index 000000000..de4c4719a --- /dev/null +++ b/kernel/arm/min.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x < 1 ) return(minf); // n == 0 must also return 0.0: otherwise x[0] is read out of bounds below + + minf=x[0]; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c new file mode 100644 index 000000000..d65c5a410 --- /dev/null +++ b/kernel/arm/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n < 0 || inc_x < 1 ) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S new file mode 100644 index 000000000..4c62917b9 --- /dev/null +++ b/kernel/arm/nrm2_vfp.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/22 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + b nrm2_begin + + +#if defined(COMPLEX) + +#if defined(DOUBLE) + +znrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +cnrm2_one: + .word 0x3f800000 + +#endif + +#else + +#if defined(DOUBLE) + +dnrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +snrm2_one: + .word 0x3f800000 + +#endif + +#endif + + + .align 5 + + +nrm2_begin: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , znrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , cnrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + +#else + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , dnrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , snrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + +#endif + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble 
nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S new file mode 100644 index 000000000..b56f8b038 --- /dev/null +++ b/kernel/arm/nrm2_vfpv3.S @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vmov.f64 d1 , #1.0 // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vmov.f32 s1 , #1.0 // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + 
diff --git a/kernel/arm/rot.c b/kernel/arm/rot.c new file mode 100644 index 000000000..aa60b4471 --- /dev/null +++ b/kernel/arm/rot.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfp.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad 
X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + 
fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + 
fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + 
+#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + +rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + + +rot_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c new file mode 100644 index 000000000..d385c46bc --- /dev/null +++ b/kernel/arm/scal.c @@ -0,0 +1,58 
@@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + + if ( n < 0 || inc_x < 1 ) return(0); + if ( da == 1.0 ) return(0); + + n *= inc_x; + while(i < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + + } + return(0); + +} + + diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S new file mode 100644 index 000000000..a04b7241e --- /dev/null +++ b/kernel/arm/scal_vfp.S @@ -0,0 +1,376 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [sp, #0 ] + + +#define N r0 +#define INC_X r1 +#define X r3 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X, { d4 - d7 } + vmul.f64 d4, d4, d0 + vmul.f64 d5, d5, d0 + vmul.f64 d6, d6, d0 + fstmiad X!, { d4 - d5 } + vmul.f64 d7, d7, d0 + fstmiad X!, { d6 - d7 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X, { d4 } + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 - s7 } + vmul.f32 s4, s4, s0 + vmul.f32 s5, s5, s0 + vmul.f32 s6, s6, s0 + fstmias X!, { s4 - s5 } + vmul.f32 s7, s7, s0 + fstmias X!, { s6 - s7 } 
+ +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X, { s4 } + add X, X, INC_X + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X, { d2 - d3 } + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + 
+.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X, { s2 - s3 } + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr INC_X , OLD_INC_X + + cmp N, #0 + ble scal_kernel_L999 + + cmp INC_X, #0 + ble scal_kernel_L999 + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + + +scal_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_F1 + + .align 5 + +scal_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble scal_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne scal_kernel_F4 + +scal_kernel_F1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + b scal_kernel_L999 + +scal_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_S1 + + .align 5 + +scal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + + +scal_kernel_L999: + + mov r0, #0 // set return value + + bx lr + + EPILOGUE + diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S new file mode 100644 index 000000000..e6ceaf2fb --- /dev/null +++ b/kernel/arm/scopy_vfp.S @@ -0,0 +1,224 @@ +/*************************************************************************** 
+Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F8 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s3 } + fldmias X!, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias Y!, { s4 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 } + fstmias Y!, { s0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + 
PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble scopy_kernel_L999 + + cmp INC_X, #0 + beq scopy_kernel_L999 + + cmp INC_Y, #0 + beq scopy_kernel_L999 + + cmp INC_X, #1 + bne scopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne scopy_kernel_S_BEGIN + +scopy_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble scopy_kernel_F1 + +scopy_kernel_F8: + + COPY_F8 + + subs I, I, #1 + bne scopy_kernel_F8 + +scopy_kernel_F1: + + ands I, N, #7 + ble scopy_kernel_L999 + +scopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne scopy_kernel_F10 + + b scopy_kernel_L999 + +scopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble scopy_kernel_S1 + +scopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne scopy_kernel_S4 + +scopy_kernel_S1: + + ands I, N, #3 + ble scopy_kernel_L999 + +scopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne scopy_kernel_S10 + + + + + + +scopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S new file mode 100644 index 000000000..2d1909201 --- /dev/null +++ b/kernel/arm/sdot_vfp.S @@ -0,0 +1,347 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK (no test for dsdot) +* TEST : OK (no test for dsdot) +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(DSDOT) + +.macro KERNEL_F4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + nop + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add 
Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + fldmias X!, { s8 - s9 } + fldmias Y!, { s4 - s5} + fmacs s0 , s4, s8 + fldmias X!, { s10 - s11 } + fmacs s1 , s5, s9 + fldmias Y!, { s6 - s7 } + fmacs s0 , s6, s10 + fmacs s1 , s7, s11 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y!, { s8 } + fmacs s0 , s4, s8 + +.endm + + +.macro KERNEL_S4 + + nop + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s4, s8 + + fldmias X, { s5 } + fldmias Y, { s9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s5, s9 + + fldmias X, { s6 } + fldmias Y, { s10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s6, s10 + + fldmias X, { s7 } + fldmias Y, { s11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s7, s11 + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + fmacs s0 , s4, s8 + add Y, Y, INC_Y + +.endm + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15 } // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + +#if defined(DSDOT) + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + +#else + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + +#endif + + cmp N, #0 + ble 
sdot_kernel_L999 + + cmp INC_X, #0 + beq sdot_kernel_L999 + + cmp INC_Y, #0 + beq sdot_kernel_L999 + + cmp INC_X, #1 + bne sdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne sdot_kernel_S_BEGIN + +sdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_F1 + +sdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne sdot_kernel_F4 + +sdot_kernel_F1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne sdot_kernel_F10 + + b sdot_kernel_L999 + +sdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_S1 + +sdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne sdot_kernel_S4 + +sdot_kernel_S1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne sdot_kernel_S10 + + + + + + +sdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if defined(DSDOT) + + vadd.f64 d0 , d0, d1 // set return value + +#else + + vadd.f32 s0 , s0, s1 // set return value + +#endif + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..0e2061d77 --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -0,0 +1,797 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + 
+#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldmias AO! , { s0 - s3 } + fldmias BO! , { s4 - s5 } + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + flds s6 , [CO2, #8 ] + flds s7 , [CO2, #12 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + fmacs s6 , s0 , s14 + fmacs s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , 
[CO1] + fsts s5 , [CO1, #4 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + flds s4 , [CO2] + + fmacs s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , 
s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble sgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO, #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO, #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + 
ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble sgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, 
CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + 
KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..38dc4d3ea --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1436 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/11/02 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 +* +* Performance on Odroid U2: +* +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 128 
+#define B_PRE 128 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + pld [ AO , #A_PRE ] + fldmias AO!, { s0 - s1 } + pld [ BO , #B_PRE ] + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s3 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias BO!, { s8 - s11 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + //fldmias AO!, { s2 - s3 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + //fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s7 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + 
fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + //fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + fldmias CO1, { s8 - s11 } + + fmacs s8 , s0 , s16 + flds s12, [CO2] + fmacs s9 , s0 , s17 + flds s13, [CO2, #4 ] + fmacs s10, s0 , s18 + flds s14, [CO2, #8 ] + fmacs s11, s0 , s19 + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fsts s8 , [CO1] + fmacs s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmacs s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmacs s15, s0 , s23 + fsts s11, [CO1, #12 ] + + pld [ CO1 , #C_PRE ] + + fldmias r4, { s8 - s11 } + + fmacs s8 , s0 , s24 + fsts s12, [CO2] + fmacs s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmacs s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmacs s11, s0 , s27 + fsts 
s15, [CO2, #12 ] + + pld [ CO2 , #C_PRE ] + + add CO2, r4 , r3 + + + fldmias CO2, { s12 - s15 } + + fsts s8 , [r4 ] + fmacs s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmacs s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmacs s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmacs s15, s0 , s31 + + pld [ r4 , #C_PRE ] + fstmias CO2, { s12 - s15 } + pld [ CO2 , #C_PRE ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + flds s8 , [r4 ] + flds s9 , [r4 , #4 ] + + fmacs s8 , s0 , s24 + fmacs s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s28 + fmacs s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + 
flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + flds s8 , [r4 ] + fmacs s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + fmacs s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + flds s14, [CO2, #8 ] + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + fmacs s14, s0 , s22 + fmacs s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + 
+/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + 
flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble sgemm_kernel_L2_BEGIN + +sgemm_kernel_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , 
r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +sgemm_kernel_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + + mov BO, BC + asrs L , K1, #1 // L = L / 8 + cmp L , #2 + blt sgemm_kernel_L4_M4_32 + + + + KERNEL4x4_I + KERNEL4x4_M2 + + subs L, L, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst L, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + + +sgemm_kernel_L4_M4_40: + + INIT4x4 + + +sgemm_kernel_L4_M4_44: + + ands L , K1, #1 // L = L % 8 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + + subs I, I, #1 + bne sgemm_kernel_L4_M4_20 + + +sgemm_kernel_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L4_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + 
KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + + +sgemm_kernel_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr J , N + tst J , #3 + ble sgemm_kernel_L999 + + tst J , #2 + ble sgemm_kernel_L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M2_40 + 
+sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble sgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + 
+sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..0546f1d69 --- /dev/null +++ 
b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #2 // lda = lda * 4 + + ldr BO, B + + +/*********************************************************************************************/ + +sgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble sgemm_ncopy_L1_BEGIN + +sgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + 
add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L2_M2_40 + +sgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_20 + + +sgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L2_M2_END + +sgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_60 + + +sgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne sgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +sgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble sgemm_ncopy_L999 + + +sgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L1_M2_40 + +sgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_20 + + +sgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L1_M2_END + +sgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_60 + + +sgemm_ncopy_L1_M2_END: + + + +sgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S new file mode 100644 index 000000000..2d8fa2e24 --- /dev/null +++ b/kernel/arm/sgemm_ncopy_4_vfp.S @@ -0,0 +1,353 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 192 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY4x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + flds s2 , [ AO3, #0 ] + flds s3 , [ AO4, #0 ] + + flds s4 , [ AO1, #4 ] + flds s8 , [ AO1, #8 ] + flds s12, [ AO1, #12 ] + + flds s5 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s9 , [ AO2, #8 ] + flds s13, [ AO2, #12 ] + + flds s6 , [ AO3, #4 ] + add AO2, AO2, #16 + flds s10, [ AO3, #8 ] + flds s14, [ AO3, #12 ] + + flds s7 , [ AO4, #4 ] + add AO3, AO3, #16 + flds s11, [ AO4, #8 ] + flds s15, [ AO4, #12 ] + + fstmias BO!, { s0 - s3 } + add AO4, AO4, #16 + fstmias BO!, { s4 - s7 } + fstmias BO!, { s8 - s15 } + +.endm + +.macro COPY1x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + flds s2 , [ AO3, #0 ] + add AO2, AO2, #4 + flds s3 , [ AO4, #0 ] + + add AO3, AO3, #4 + fstmias BO!, { s0 - s3 } + add AO4, AO4, #4 + +.endm + +.macro COPY4x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s6 , [ AO1, #12 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s5 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro COPY4x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #2 // lda = lda * 4 + str r3, LDA + 
+ sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +sgemm_ncopy_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble sgemm_ncopy_L2_BEGIN + +sgemm_ncopy_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble sgemm_ncopy_L4_M4_40 + +sgemm_ncopy_L4_M4_20: + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + COPY4x4 + + subs I , I , #1 + ble sgemm_ncopy_L4_M4_40 + + COPY4x4 + + subs I , I , #1 + bne sgemm_ncopy_L4_M4_20 + + +sgemm_ncopy_L4_M4_40: + + ands I, M , #3 + ble sgemm_ncopy_L4_M4_END + +sgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne sgemm_ncopy_L4_M4_60 + + +sgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne sgemm_ncopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble sgemm_ncopy_L999 + + tst N, #2 + ble sgemm_ncopy_L1_BEGIN + +sgemm_ncopy_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble sgemm_ncopy_L2_M4_40 + +sgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M4_20 + + +sgemm_ncopy_L2_M4_40: + + ands I, M , #3 + ble sgemm_ncopy_L2_M4_END + +sgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M4_60 + + +sgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +sgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble sgemm_ncopy_L999 + + +sgemm_ncopy_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble sgemm_ncopy_L1_M4_40 + +sgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M4_20 + + +sgemm_ncopy_L1_M4_40: + + ands I, M , #3 + ble 
sgemm_ncopy_L1_M4_END + +sgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M4_60 + + +sgemm_ncopy_L1_M4_END: + + + +sgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S new file mode 100644 index 000000000..b0a3278ff --- /dev/null +++ b/kernel/arm/sgemm_tcopy_4_vfp.S @@ -0,0 +1,430 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4_1 + + pld [ AO1, #A_PRE ] + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro 
COPY4x4_2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + + +.macro COPY2x4 + + fldmias AO1, { s0 - s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + add r3, r3, LDA + fldmias r3, { s4 - s5 } + + add r3, r3, LDA + fldmias r3, { s6 - s7 } + + fstmias BO2, { s0 - s7 } + add AO1, AO1, #8 + add BO2, BO2, #32 + +.endm + +.macro COPY1x4 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + add r3, r3, LDA + fldmias r3, { s2 } + + add r3, r3, LDA + fldmias r3, { s3 } + + fstmias BO3, { s0 - s3 } + add AO1, AO1, #4 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmias AO1, { s0 - s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + fstmias BO3, { s0 - s1 } + add AO1, AO1, #4 + add BO3, BO3, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 } + + fstmias BO3, { s0 } + add AO1, AO1, #4 + add BO3, BO3, #4 + +.endm + + + +/************************************************************************************** +* End of macro 
definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #2 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #2 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 + add BO3 , BO3, r3 + + lsl M4, M, #4 // M4 = M * 4 * SIZE + +sgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = M / 4 + ble sgemm_tcopy_L2_BEGIN + +sgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L4_M4_40 + +sgemm_tcopy_L4_M4_20: + + COPY4x4_1 + + subs I , I , #1 + ble sgemm_tcopy_L4_M4_40 + + COPY4x4_2 + + subs I , I , #1 + bne sgemm_tcopy_L4_M4_20 + + +sgemm_tcopy_L4_M4_40: + + tst N , #2 + ble sgemm_tcopy_L4_M4_60 + + COPY2x4 + + +sgemm_tcopy_L4_M4_60: + + tst N, #1 + ble sgemm_tcopy_L4_M4_END + + COPY1x4 + + +sgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne sgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble sgemm_tcopy_L999 + + tst M, #2 + ble sgemm_tcopy_L1_BEGIN + +sgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L2_M4_40 + +sgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne sgemm_tcopy_L2_M4_20 + + +sgemm_tcopy_L2_M4_40: + + tst N , #2 + ble sgemm_tcopy_L2_M4_60 + + COPY2x2 + +sgemm_tcopy_L2_M4_60: + + tst N , 
#1 + ble sgemm_tcopy_L2_M4_END + + COPY1x2 + + +sgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +sgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble sgemm_tcopy_L999 + + +sgemm_tcopy_L1_M4_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L1_M4_40 + +sgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne sgemm_tcopy_L1_M4_20 + + +sgemm_tcopy_L1_M4_40: + + tst N , #2 + ble sgemm_tcopy_L1_M4_60 + + COPY2x1 + +sgemm_tcopy_L1_M4_60: + + tst N , #1 + ble sgemm_tcopy_L1_M4_END + + COPY1x1 + + +sgemm_tcopy_L1_M4_END: + + + +sgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..ab5ff7fa2 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -0,0 +1,1081 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + 
+ +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s5 } + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + fmuls s6 , s0 , s14 + fmuls s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + 
+.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + + fmuls s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + 
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO , #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO , #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt 
_L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 
, #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // 
store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , 
r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB 
+ KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..3a0c8af87 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1884 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-244 ] +#define KKK [fp, #-248] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + 
+.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fmuls s12, s0 , s20 + fsts s8 , [CO1] + fmuls s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmuls s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmuls s15, s0 , s23 + fsts s11, [CO1, #12 ] + + + fmuls s8 , s0 , s24 + fsts s12, [CO2] + fmuls s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmuls s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmuls s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + fsts s8 , [r4 ] + fmuls s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmuls s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmuls s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmuls s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + + fmuls s8 , s0 , s24 + fmuls s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fmuls s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + fmuls s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + 
vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + fmuls s14, s0 , s22 + fmuls s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds 
s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + 
+ + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs 
L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK 
+#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float 
values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + 
+_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + 
INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M 
+ asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK 
+ sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: 
+ + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/swap.c b/kernel/arm/swap.c new file mode 100644 index 000000000..1ca9e7607 --- /dev/null +++ b/kernel/arm/swap.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/08/20 Saar
+* BLASTEST float OK
+* BLASTEST double OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT temp;
+
+ if ( n < 0 ) return(0);
+
+ while(i < n)
+ {
+
+ temp = x[ix] ;
+ x[ix] = y[iy] ;
+ y[iy] = temp ;
+
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S
new file mode 100644
index 000000000..352875188
--- /dev/null
+++ b/kernel/arm/swap_vfp.S
@@ -0,0 +1,354 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y!, { d0 } + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y, { d0 } + fstmiad X, { d4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { 
s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y!, { s0 } + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y, { s0 } + fstmias X, { s4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + +.macro KERNEL_F1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y!, { d0 - d1 } + fstmiad X!, { d4 - d5 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y, { d0 - d1 } + fstmiad X, { d4 - d5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + +.macro KERNEL_F1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y!, { s0 - s1 } + fstmias X!, { s4 - s5 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y, { s0 - s1 } + fstmias X, { s4 - s5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble swap_kernel_L999 + + cmp INC_X, #0 + beq swap_kernel_L999 + + cmp 
INC_Y, #0 + beq swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + + +swap_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_F1 + + .align 5 + +swap_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble swap_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne swap_kernel_F4 + +swap_kernel_F1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + +swap_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_S1 + + .align 5 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + + +swap_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c new file mode 100644 index 000000000..8c2a5c346 --- /dev/null +++ b/kernel/arm/zamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT maxf[2];
+ BLASLONG max=0;
+ BLASLONG inc_x2;
+
+ if (n < 0 || inc_x < 1 ) return(0.0);
+
+ inc_x2 = 2 * inc_x;
+
+ maxf[0] = ABS(x[ix]);
+ maxf[1] = ABS(x[ix+1]);
+
+ while(i < n)
+ {
+ if( CABS1(x,ix) > CABS1(maxf,0) )
+ {
+ max = i;
+ maxf[0] = ABS(x[ix]);
+ maxf[1] = ABS(x[ix+1]);
+ }
+ ix += inc_x2;
+ i++;
+ }
+ return(CABS1(maxf,0));
+}
+
+
diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c
new file mode 100644
index 000000000..6956ced0e
--- /dev/null
+++ b/kernel/arm/zamin.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT minf[2];
+ BLASLONG min=0;
+ BLASLONG inc_x2;
+
+ if (n < 0 || inc_x < 1 ) return(0.0);
+
+ inc_x2 = 2 * inc_x;
+
+ minf[0] = ABS(x[ix]);
+ minf[1] = ABS(x[ix+1]);
+
+ while(i < n)
+ {
+ if( CABS1(x,ix) < CABS1(minf,0) )
+ {
+ min = i;
+ minf[0] = ABS(x[ix]);
+ minf[1] = ABS(x[ix+1]);
+ }
+ ix += inc_x2;
+ i++;
+ }
+ return(CABS1(minf,0));
+}
+
+
diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c
new file mode 100644
index 000000000..13acfc0f0
--- /dev/null
+++ b/kernel/arm/zasum.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+ BLASLONG inc_x2;
+ if (n < 0 || inc_x < 1 ) return(sumf);
+
+ inc_x2 = 2 * inc_x;
+
+ n *= inc_x2;
+ while(i < n)
+ {
+ sumf += CABS1(x,i);
+ i += inc_x2;
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c
new file mode 100644
index 000000000..28a4380fb
--- /dev/null
+++ b/kernel/arm/zaxpy.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/15 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c new file mode 100644 index 000000000..654711240 --- /dev/null +++ b/kernel/arm/zcopy.c @@ -0,0 +1,63 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S new file mode 100644 index 000000000..06f892446 --- /dev/null +++ b/kernel/arm/zcopy_vfp.S @@ -0,0 +1,223 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + pld [ X, #X_PRE+32 ] + fldmiad X!, { d0 - d7 } + fstmiad Y!, { d0 - d7 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 - d1 } + fstmiad Y!, { d0 - d1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add 
X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble zcopy_kernel_L999 + + cmp INC_X, #0 + beq zcopy_kernel_L999 + + cmp INC_Y, #0 + beq zcopy_kernel_L999 + + cmp INC_X, #1 + bne zcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zcopy_kernel_S_BEGIN + +zcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_F1 + +zcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne zcopy_kernel_F4 + +zcopy_kernel_F1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne zcopy_kernel_F10 + + b zcopy_kernel_L999 + +zcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_S1 + +zcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne zcopy_kernel_S4 + +zcopy_kernel_S1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne zcopy_kernel_S10 + + + + + + +zcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c new file mode 100644 index 000000000..096ced9db --- /dev/null +++ 
b/kernel/arm/zdot.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : FAIL
+* BLASTEST double : FAIL
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <complex.h>
+
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT dot[2];
+ FLOAT _Complex result;
+
+ dot[0]=0.0;
+ dot[1]=0.0;
+
+ __real__ result = 0.0 ;
+ __imag__ result = 0.0 ;
+
+ if ( n < 1 ) return(result);
+
+ BLASLONG inc_x2 = 2 * inc_x ;
+ BLASLONG inc_y2 = 2 * inc_y ;
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ;
+ dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ;
+#else
+ dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ;
+ dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ;
+#endif
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+ __real__ result = dot[0];
+ __imag__ result = dot[1];
+ return(result);
+
+}
+
+
diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S
new file mode 100644
index 000000000..1a78b5aec
--- /dev/null
+++ b/kernel/arm/zdot_vfp.S
@@ -0,0 +1,286 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + 
fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fldmiad X!, { d6 - d7 } + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + + fldmiad Y!, { d10 - d11 } + fmacd d0 , d6, d10 + fmacd d1 , d6, d11 + pld [ X, #X_PRE ] + fmacd d2 , d7, d11 + fmacd d3 , d7, d10 + + pld [ Y, #X_PRE ] + + fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fldmiad X!, { d6 - d7 } + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + + fldmiad Y!, { d10 - d11 } + fmacd d0 , d6, d10 + fmacd d1 , d6, d11 + fmacd d2 , d7, d11 + fmacd d3 , d7, d10 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y!, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d8 - d9 } + fmacd d0 , d4, d8 + fmacd d1 , d4, d9 + fmacd d2 , d5, d9 + fmacd d3 , d5, d8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions 
**************************************************************************************/

// zdot kernel entry: complex double dot product of X and Y.
//   r0 = N (element count), r1 = X, r2 = INC_X (elements), r3 = old Y,
//   INC_Y passed on the stack (OLD_INC_Y).
// Result: accumulators d0..d3 are combined under CONJ below and returned
// in d0 (real) / d1 (imag) — hard-float ABI return (build uses
// -mfloat-abi=hard, see Makefile.arm).
	PROLOGUE

	.align 5

	push {r4 - r9, fp}
	add fp, sp, #24
	sub sp, sp, #STACKSIZE // reserve stack

	sub r4, fp, #128
	vstm r4, { d8 - d15} // store floating point registers (callee-saved VFP regs)

	mov Y, OLD_Y
	ldr INC_Y, OLD_INC_Y

	// Zero the four partial-sum accumulators (x - x == 0).
	vsub.f64 d0 , d0 , d0
	vsub.f64 d1 , d1 , d1
	vsub.f64 d2 , d2 , d2
	vsub.f64 d3 , d3 , d3

	// Degenerate cases: N <= 0 or a zero increment -> return 0+0i.
	cmp N, #0
	ble zdot_kernel_L999

	cmp INC_X, #0
	beq zdot_kernel_L999

	cmp INC_Y, #0
	beq zdot_kernel_L999

	// Fast path only when both vectors are contiguous.
	cmp INC_X, #1
	bne zdot_kernel_S_BEGIN

	cmp INC_Y, #1
	bne zdot_kernel_S_BEGIN

zdot_kernel_F_BEGIN:

	asrs I, N, #2 // I = N / 4
	ble zdot_kernel_F1

zdot_kernel_F4:

	KERNEL_F4

	subs I, I, #1
	bne zdot_kernel_F4

zdot_kernel_F1:

	ands I, N, #3 // remainder N % 4
	ble zdot_kernel_L999

zdot_kernel_F10:

	KERNEL_F1

	subs I, I, #1
	bne zdot_kernel_F10

	b zdot_kernel_L999

zdot_kernel_S_BEGIN:

	// Convert element strides to byte strides: 2 doubles per element.
	lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
	lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2

	asrs I, N, #2 // I = N / 4
	ble zdot_kernel_S1

zdot_kernel_S4:

	KERNEL_S4

	subs I, I, #1
	bne zdot_kernel_S4

zdot_kernel_S1:

	ands I, N, #3 // remainder N % 4
	ble zdot_kernel_L999

zdot_kernel_S10:

	KERNEL_S1

	subs I, I, #1
	bne zdot_kernel_S10



zdot_kernel_L999:

	sub r3, fp, #128
	vldm r3, { d8 - d15} // restore floating point registers

	// Combine partial sums (d0=Σxr*yr, d1=Σxr*yi, d2=Σxi*yi, d3=Σxi*yr):
#if !defined(CONJ)
	// zdotu: re = Σxr*yr - Σxi*yi, im = Σxr*yi + Σxi*yr
	vsub.f64 d0 , d0, d2
	vadd.f64 d1 , d1, d3
#else
	// zdotc (conj(x)·y): re = Σxr*yr + Σxi*yi, im = Σxr*yi - Σxi*yr
	vadd.f64 d0 , d0, d2
	vsub.f64 d1 , d1, d3
#endif

	sub sp, fp, #24
	pop {r4 - r9, fp}
	bx lr

	EPILOGUE

diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S
new file mode 100644
index 000000000..8a5401858
--- /dev/null
+++ b/kernel/arm/zgemm_kernel_2x2_vfp.S
@@ -0,0 +1,1299 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +***************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + 
/**************************************************************************************
* Macro definitions
*
* 2x2 micro-kernel for ZGEMM (plain VFP variant, accumulators d8-d15).
* KMAC_R / KMAC_I are fmacd or fnmacd depending on the NN/CN/NC/CC
* conjugation variant (see the #define block above), so each d-pair
* (d8,d9), (d10,d11), (d12,d13), (d14,d15) accumulates the real/imag
* part of one C(i,j) entry of the 2x2 tile.
**************************************************************************************/

// Zero the eight accumulators (x - x == 0, then broadcast via vmov).
.macro INIT2x2

	vsub.f64 d8 , d8 , d8
	vmov.f64 d9 , d8
	vmov.f64 d10, d8
	vmov.f64 d11, d8
	vmov.f64 d12, d8
	vmov.f64 d13, d8
	vmov.f64 d14, d8
	vmov.f64 d15, d8

.endm

// First unrolled iteration: fmuld (overwrite) instead of fmacd, so the
// accumulators need not be zeroed beforehand.  Loads one 2x1 complex
// column of A (d0..d3) and one 1x2 complex row of B (d4..d7).
.macro KERNEL2x2_I
	pld [ AO, #A_PRE ]
	pld [ BO, #B_PRE ]

	fldd d0 , [ AO ]                 // A0 real
	fldd d1 , [ AO, #8 ]             // A0 imag

	fldd d2 , [ AO, #16 ]            // A1 real
	fldd d3 , [ AO, #24 ]            // A1 imag

	fldd d4 , [ BO ]                 // B0 real
	fldd d5 , [ BO, #8 ]             // B0 imag

	fldd d6 , [ BO, #16 ]            // B1 real
	fldd d7 , [ BO, #24 ]            // B1 imag

	// C00: d8 = re(A0*B0), d9 = im(A0*B0) (signs per KMAC_R/KMAC_I)
	fmuld d8 , d0, d4
	KMAC_R d8 , d1, d5
	fmuld d9 , d0, d5
	KMAC_I d9 , d1, d4

	// C10
	fmuld d10 , d2, d4
	KMAC_R d10 , d3, d5
	fmuld d11 , d2, d5
	KMAC_I d11 , d3, d4

	// C01
	fmuld d12 , d0, d6
	KMAC_R d12 , d1, d7
	fmuld d13 , d0, d7
	KMAC_I d13 , d1, d6

	// C11
	fmuld d14 , d2, d6
	KMAC_R d14 , d3, d7
	fmuld d15 , d2, d7
	KMAC_I d15 , d3, d6

	add BO , BO, #32                 // advance B: 2 complex doubles
	add AO , AO, #32                 // advance A: 2 complex doubles

.endm


// Steady-state iteration.  Loads are interleaved with the multiply-
// accumulates to hide VFP load latency; do not reorder.
.macro KERNEL2x2_M1


	fldd d0 , [ AO ]
	fldd d4 , [ BO ]
	fldd d5 , [ BO, #8 ]

	fmacd d8 , d0, d4
	fldd d1 , [ AO, #8 ]
	fmacd d9 , d0, d5
	fldd d2 , [ AO, #16 ]
	KMAC_R d8 , d1, d5
	fldd d3 , [ AO, #24 ]
	KMAC_I d9 , d1, d4

	fldd d6 , [ BO, #16 ]
	fmacd d10 , d2, d4
	fldd d7 , [ BO, #24 ]
	fmacd d11 , d2, d5
	KMAC_R d10 , d3, d5
	pld [ AO, #A_PRE ]
	KMAC_I d11 , d3, d4

	pld [ BO, #B_PRE ]
	fmacd d12 , d0, d6
	fmacd d13 , d0, d7
	KMAC_R d12 , d1, d7
	KMAC_I d13 , d1, d6

	fmacd d14 , d2, d6
	fmacd d15 , d2, d7
	add BO , BO, #32
	KMAC_R d14 , d3, d7
	add AO , AO, #32
	KMAC_I d15 , d3, d6


.endm

// Second steady-state iteration — identical schedule to KERNEL2x2_M1
// in this VFP variant (kept separate to mirror the vfpv3 kernel's
// M1/M2 ping-pong structure).
.macro KERNEL2x2_M2

	fldd d0 , [ AO ]
	fldd d4 , [ BO ]
	fldd d5 , [ BO, #8 ]

	fmacd d8 , d0, d4
	fldd d1 , [ AO, #8 ]
	fmacd d9 , d0, d5
	fldd d2 , [ AO, #16 ]
	KMAC_R d8 , d1, d5
	fldd d3 , [ AO, #24 ]
	KMAC_I d9 , d1, d4

	fldd d6 , [ BO, #16 ]
	fmacd d10 , d2, d4
	fldd d7 , [ BO, #24 ]
fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + fldmiad CO2, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 
, d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 
] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + fldmiad CO2, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, 
d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , 
d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble zgemm_kernel_L1_BEGIN + +zgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +zgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp 
L , #3 + blt zgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_30: + tst L, #3 + ble zgemm_kernel_L2_M2_40 + + tst L, #2 + ble zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_40: + + INIT2x2 + + +zgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L2_M2_20 + + +zgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: 
+ + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt zgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble zgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +zgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt zgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_30: + tst L, #3 + ble zgemm_kernel_L1_M2_40 + + tst L, #2 + ble zgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + +zgemm_kernel_L1_M2_32: + + tst L, #1 + ble zgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_40: + + INIT2x1 + + +zgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L 
= L % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne zgemm_kernel_L1_M2_46 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L1_M2_20 + + +zgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..2d35028a2 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1345 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers 
+*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + 
fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + 
add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + fldmiad CO2, { d8 - d11 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO 
, #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + fldmiad CO2, { d8 - d9 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 
, d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + 
vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + 
+ PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble zgemm_kernel_L1_BEGIN + +zgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +zgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_30: + tst L, #3 + ble zgemm_kernel_L2_M2_40 + + tst L, #2 + ble zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b 
zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_40: + + INIT2x2 + + +zgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L2_M2_20 + + +zgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt zgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble zgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +zgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, 
L, #1 + bgt zgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_30: + tst L, #3 + ble zgemm_kernel_L1_M2_40 + + tst L, #2 + ble zgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + +zgemm_kernel_L1_M2_32: + + tst L, #1 + ble zgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_40: + + INIT2x1 + + +zgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne zgemm_kernel_L1_M2_46 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L1_M2_20 + + +zgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git 
a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..5ff8ee299 --- /dev/null +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d5 , [ AO1, #24 ] + + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d6 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + + + + 
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #4 // lda = lda * 8 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +zgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble zgemm_ncopy_L1_BEGIN + +zgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L2_M2_40 + +zgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_20 + + +zgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L2_M2_END + +zgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_60 + + +zgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +zgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble zgemm_ncopy_L999 + + +zgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L1_M2_40 + +zgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_20 + + +zgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L1_M2_END + +zgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_60 + + +zgemm_ncopy_L1_M2_END: + + + +zgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S 
b/kernel/arm/zgemm_tcopy_2_vfp.S new file mode 100644 index 000000000..7e27ca6a6 --- /dev/null +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 -d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #4 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #5 // M4 = M * 2 * SIZE * 2 + +zgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = N / 2 + ble zgemm_tcopy_L1_BEGIN + +zgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L2_M2_60 + +zgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne zgemm_tcopy_L2_M2_40 + +zgemm_tcopy_L2_M2_60: + + tst N , #1 + ble zgemm_tcopy_L2_M2_END + + COPY1x2 + + +zgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +zgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble zgemm_tcopy_L999 + + +zgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L1_M2_60 + + +zgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + bne zgemm_tcopy_L1_M2_40 + +zgemm_tcopy_L1_M2_60: + + tst N , #1 + ble zgemm_tcopy_L1_M2_END + + COPY1x1 + + +zgemm_tcopy_L1_M2_END: + + + +zgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c new file mode 100644 index 
000000000..dc2ffa0d2 --- /dev/null +++ b/kernel/arm/zgemv_n.c @@ -0,0 +1,157 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** + * * 2013/11/23 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n < 0 || inc_x < 1 ) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c new file mode 100644 index 000000000..4a2f37f64 --- /dev/null +++ b/kernel/arm/zrot.c @@ -0,0 +1,68 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n <= 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x ; + BLASLONG inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c new file mode 100644 index 000000000..833dc8c03 --- /dev/null +++ b/kernel/arm/zscal.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( n < 0 || inc_x < 1 ) return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + 
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..59039c32f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -0,0 +1,1537 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + 
#define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd 
d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + 
FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + 
fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + 
KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + 
KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, 
N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 
double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + 
ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + 
KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..917ce610f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1538 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define 
B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd 
d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] 
+ pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , 
d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 
d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , 
BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 
+.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + 
ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) 
+ ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // 
number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst 
L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 
+ lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm64/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 new file mode 100644 index 000000000..ecf278cf9 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8 @@ -0,0 +1,134 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + 
+ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ 
= dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm64/Makefile b/kernel/arm64/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c new file mode 100644 index 000000000..1ecb93c65 --- /dev/null +++ b/kernel/generic/gemm_ncopy_6.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + *(b_offset + 4) = ctemp2; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp10; + *(b_offset + 7) = ctemp14; + + *(b_offset + 8) = ctemp3; + *(b_offset + 9) = ctemp7; + *(b_offset + 10) = ctemp11; + *(b_offset + 11) = ctemp15; + + *(b_offset + 12) = ctemp4; + *(b_offset + 13) = ctemp8; + *(b_offset + 14) = ctemp12; + *(b_offset + 15) = ctemp16; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 
2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c new file mode 100644 index 000000000..bd32090e7 --- /dev/null +++ b/kernel/generic/gemm_tcopy_6.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 
2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = 
*(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + 
*(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c new file mode 100644 index 000000000..ac04943e2 --- /dev/null +++ b/kernel/generic/symm_lcopy_6.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; 
+ i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c new file mode 100644 index 000000000..9b9cff820 --- /dev/null +++ b/kernel/generic/symm_ucopy_6.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git 
a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c new file mode 100644 index 000000000..6cd16673a --- /dev/null +++ b/kernel/generic/trmm_lncopy_6.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + 
data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 
2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 
0); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c new file mode 100644 index 000000000..69a233be6 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_6.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = 
data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; 
+ b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + 
data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c new file mode 100644 index 000000000..70945a246 --- /dev/null +++ b/kernel/generic/trmm_uncopy_6.c @@ -0,0 +1,785 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, mm; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data07, data08, data09, data10, data11, data12; + FLOAT data13, data14, data15, data16, data17, data18; + FLOAT data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data31, data32, data33, data34, data35, data36; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + //js = (n >> 2); + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + 
ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + + data07 = *(ao2 + 0); + data08 = *(ao2 + 1); + data09 = *(ao2 + 2); + data10 = *(ao2 + 3); + data11 = *(ao2 + 4); + data12 = *(ao2 + 5); + + data13 = *(ao3 + 0); + data14 = *(ao3 + 1); + data15 = *(ao3 + 2); + data16 = *(ao3 + 3); + data17 = *(ao3 + 4); + data18 = *(ao3 + 5); + + data19 = *(ao4 + 0); + data20 = *(ao4 + 1); + data21 = *(ao4 + 2); + data22 = *(ao4 + 3); + data23 = *(ao4 + 4); + data24 = *(ao4 + 5); + + data25 = *(ao5 + 0); + data26 = *(ao5 + 1); + data27 = *(ao5 + 2); + data28 = *(ao5 + 3); + data29 = *(ao5 + 4); + data30 = *(ao5 + 5); + + data31 = *(ao6 + 0); + data32 = *(ao6 + 1); + data33 = *(ao6 + 2); + data34 = *(ao6 + 3); + data35 = *(ao6 + 4); + data36 = *(ao6 + 5); + + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = data02; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = data03; + b[13] = data09; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = data04; + b[19] = data10; + b[20] = data16; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = data05; + b[25] = data11; + b[26] = data17; + b[27] = data23; + b[28] = data29; + b[29] = data35; + + b[30] = data06; + b[31] = data12; + b[32] = data18; + b[33] = data24; + b[34] = data30; + b[35] = data36; + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = 
ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } else { + data01 = *(ao1 + 0); + data07 = *(ao2 + 0); + data13 = *(ao3 + 0); + data19 = *(ao4 + 0); + data25 = *(ao5 + 0); + data31 = *(ao6 + 0); + + data08 = *(ao2 + 1); + data14 = *(ao3 + 1); + data20 = *(ao4 + 1); + data26 = *(ao5 + 1); + data32 = *(ao6 + 1); + + data15 = *(ao3 + 2); + data21 = *(ao4 + 2); + data27 = *(ao5 + 2); + data33 = *(ao6 + 2); + + data22 = *(ao4 + 3); + data28 = *(ao5 + 3); + data34 = *(ao6 + 3); + + data29 = *(ao5 + 4); + data35 = *(ao6 + 4); + + data36 = *(ao6 + 5); + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = ONE; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ONE; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ONE; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ONE; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ONE; +#else + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + 
b[26] = ZERO; + b[27] = ZERO; + b[28] = data29; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = data36; +#endif + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 7; + + b += 36; + } + X += 6; + i --; + } while (i > 0); + } + mm = m - m/6; + if (mm & 4) { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = 
ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + X += 4; + } + + if (mm & 3) { + if (X < posY) { + if (mm & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (mm & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 
+ 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = 
posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c new file mode 100644 index 000000000..7d4dba34b --- /dev/null +++ b/kernel/generic/trmm_utcopy_6.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + 
data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 
3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 
+= 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c index 068a202b8..931cba377 100644 --- a/kernel/generic/trsm_kernel_LN.c +++ b/kernel/generic/trsm_kernel_LN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c index 300fdd483..099624252 100644 --- a/kernel/generic/trsm_kernel_LT.c +++ b/kernel/generic/trsm_kernel_LT.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 
+#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c index b85c3c1e9..d7e650e0c 100644 --- a/kernel/generic/trsm_kernel_RN.c +++ b/kernel/generic/trsm_kernel_RN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c index 2adb3a4f7..a46945330 100644 --- a/kernel/generic/trsm_kernel_RT.c +++ b/kernel/generic/trsm_kernel_RT.c @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c new file mode 100644 index 000000000..9f7bcc2dd --- /dev/null +++ b/kernel/generic/trsm_lncopy_6.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = INV(data11); + + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 
+= 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data05; + *(b + 3) = data07; + *(b + 4) = data02; + *(b + 5) = data04; + *(b + 6) = data06; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 
0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c new file mode 100644 index 000000000..d891468a4 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_6.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) 
= data08; + + *(b + 10) = INV(data11); + *(b + 11) = data12; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + 
*(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c new file mode 100644 index 000000000..837a25019 --- /dev/null +++ b/kernel/generic/trsm_uncopy_6.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 10) = INV(data11); + *(b + 11) = data15; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 
+= 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); + data09 = *(a3 + 0); + data13 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== 
jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c new file mode 100644 index 000000000..bbba78d53 --- /dev/null +++ b/kernel/generic/trsm_utcopy_6.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = INV(data11); + + 
*(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + 
a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4a7526196..cf868f103 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -762,6 +762,23 @@ static void init_parameter(void) { #endif #endif +#ifdef HASWELL + +#ifdef DEBUG + fprintf(stderr, "Haswell\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef OPTERON #ifdef DEBUG diff --git a/kernel/x86/KERNEL.HASWELL b/kernel/x86/KERNEL.HASWELL new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.HASWELL @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S 
b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index ebd1377f1..16ba9a0e3 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 6fa7d410e..03f8e3d79 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 9ce4cd8d4..65a6cf091 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index a1a35a7a5..d27880b8d 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index a5333640d..ff8231e16 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ 
-62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index c3619ec3d..857866552 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 53e53c3ce..6d9880556 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 3c056cdff..452e3bf87 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 1efa1fd25..64232fdfb 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define 
PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 849afed73..71246d7a6 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index c1833abe2..dfd555c88 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL new file mode 100644 index 000000000..447481019 --- /dev/null +++ b/kernel/x86_64/KERNEL.HASWELL @@ -0,0 +1,63 @@ +SGEMMKERNEL = sgemm_kernel_16x4_haswell.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_4x4_haswell.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S + diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 8ebd42244..abed953c3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S -SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S 
@@ -16,7 +16,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S + +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S @@ -25,7 +26,8 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S + +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -34,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -52,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S new file mode 100644 index 000000000..931316285 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -0,0 +1,1920 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +/********************************************************************* +* +* 2013/10/31 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define 
OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R 
%xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + 
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + 
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, 
Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + 
KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + 
vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + 
+ vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + 
vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq 
KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + 
vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss 
ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values 
in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S new file mode 100644 index 000000000..e4aba23e4 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -0,0 +1,2284 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 384 +* CGEMM_DEFAULT_Q 192 +* A_PR1 512 +* B_PR1 512 +* +* Performance at 6912x6912x6912: +* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) +* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) +* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) +* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) +* +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx 
+ +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#else + +#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) 
|| defined(TT) + +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#else + +#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 + +#endif + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) + addq $4 , BI + addq $16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm11, %ymm11, %ymm11 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $0xb1, 
%ymm10, %ymm10, %ymm11 + vshufps $0xb1, %ymm12, %ymm12, %ymm13 + vshufps $0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm11, %ymm11, %ymm11 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + vshufps $0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + 
VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $4, BI + addq $8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, 
LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $4, BI + addq $4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( 
%xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $4, BI + addq $2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + addq $2 , BI + addq $16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + 
vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $2, BI + addq $8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps 
%xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 4 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $2, BI + addq $4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( 
%xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, 
OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO 
+#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq 
(BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $7, M + jz .L2_4_60 // to next 2 lines of N + + testq $4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 
A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + 
vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL 
+ movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * 
SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax *16 ; number of values + 
leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + 
prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + 
movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: 
+ testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + 
negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S new file mode 100644 index 000000000..2907a6871 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -0,0 +1,5215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define 
STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3*SIZE , BO + addq $16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $3*SIZE , BO + addq $8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * 
SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3*SIZE , BO + addq $4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3*SIZE , BO + addq $2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3*SIZE , BO + addq $1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 
+.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 
+ + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + 
VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ 
+ +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups 
%ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * 
SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ 
%xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax 
+.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 
+.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups 
%ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, 
BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * 
SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 
-28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 
+ +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + 
+/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 
+ vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $4, BI + addq $4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $1, BI + addq $1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq 
%r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * 
SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) 
+ vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + 
vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 
+ +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * 
SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + 
je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, 
%rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + 
KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + 
leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
+.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, 
CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + 
KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 
+ KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 
+#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + 
KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number 
of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK 
+#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of 
values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + 
ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl 
.L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl 
.L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S new file mode 100644 index 000000000..1bfb71572 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -0,0 +1,3479 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + 
+#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*12 + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + 
prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), 
%ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $0xb1 , %ymm5, %ymm5 + vpermpd $0xb1 , %ymm7, %ymm7 + + vblendpd $0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $0x05, 
%ymm5, %ymm4, %ymm1 + vblendpd $0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $0xb1 , %ymm9 , %ymm9 + vpermpd $0xb1 , %ymm11, %ymm11 + + vblendpd $0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $0xb1 , %ymm13, %ymm13 + vpermpd $0xb1 , %ymm15, %ymm15 + + vblendpd $0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $0x05, %ymm13, %ymm12, %ymm1 + vblendpd $0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $0x1b , %ymm2, 
%ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + 
vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd 
%xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd 
%xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $4*SIZE, BO + vpermpd $0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $8*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 
,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $8*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $4*SIZE, BO + vpermpd $0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $4*SIZE, AO + vpermpd $0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $0xb1 , %ymm5, %ymm5 + vpermpd $0xb1 , %ymm7, %ymm7 + + vblendpd $0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $0x05, %ymm5, %ymm4, %ymm1 + vblendpd $0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $0x1b , %ymm2, %ymm2 + vpermpd $0x1b , %ymm3, %ymm3 + vpermpd $0xb1 , %ymm2, %ymm2 + vpermpd $0xb1 , %ymm3, %ymm3 + + vblendpd $0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro 
KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, 
LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $2*SIZE, BO + addq $4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $2*SIZE, BO + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, 
LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $2*SIZE, BO + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL4x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + addq $1*SIZE, BO + addq $4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + + addq $4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + 
+.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $1*SIZE, BO + addq $2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $1*SIZE, BO + addq $1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + 
cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $8 * SIZE ,BO1 + addq $8 * SIZE ,BO2 + addq $8 * SIZE ,BO3 + addq $24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO2 + addq $4*SIZE,BO3 + addq $12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 
* SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + .align 32 + +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + 
addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + .align 32 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ 
+.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + .align 32 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if 
(k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je 
.L1_16 + + .align 32 + +.L1_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), 
%xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if 
defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + .align 32 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + 
subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of 
values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + 
+/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + .align 32 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- 
+ jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I 
// i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + .align 32 + +.L1_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + 
movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S new file mode 100644 index 000000000..7b5dd1587 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S @@ -0,0 +1,1734 @@ +/**************************************************************************** 
+Copyright (c) 2013, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +// register blocking= 6x4. unloop k = 4. +// Use FMA3 on piledriver. +// Todo: 1) deal with the edge. 2) Add windows abi. 
+ +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 128 +#define oldbk_i %rdi +#define oldbk_j %rsi +#define oldbk_l %rdx + +#define _bk_i %r13 +#define _bk_j %r14 +#define _bk_l %r15 + +#define ALPHA %xmm0 +#define _ptr_A %rcx +#define _ptr_B %r8 +#define _ptr_C %r9 +#define LDC %r10 + +#define i %r11 +#define k %rax +#define _pre_B %r12 +#define _ptr__A_0 %rdi +#define _ptr__B_0 %rsi +#define _ptr__C_0 %rbx +#define _ptr__C_1 %rbp + +#define old_ldc 8+STACKSIZE(%rsp) +#define alpha 48(%rsp) +#define j 56(%rsp) + +#define MOVQ2560(s,d) movq s,d +#define LEAQ2560(s,d) leaq s,d +#define SARQ2560(imm,n) sarq imm,n +#define ADDQ2560(off,addr) addq off,addr +#define SUBQ2560(off,addr) subq off,addr +#define DIVQ2560(off,addr) divq off,addr +#define MULQ2560(s,d) mulq s,d +#define DECQ2560(addr) decq addr +#define NEGQ2560(s) negq s +#define TESTQ2560(n,addr) testq n,addr +#define SALQ2560(imm,n) salq imm,n + +#define MOVQ1280(s,d) movq s,d +#define LEAQ1280(s,d) leaq s,d +#define SARQ1280(imm,n) sarq imm,n +#define ADDQ1280(off,addr) addq off,addr +#define SUBQ1280(off,addr) subq off,addr +#define DIVQ1280(off,addr) divq off,addr +#define CMPQ1280(off,addr) cmpq off,addr +#define MULQ1280(s,d) mulq s,d +#define DECQ1280(addr) decq addr +#define NEGQ1280(s) negq s +#define TESTQ1280(n,addr) testq n,addr +#define SALQ1280(imm,n) salq imm,n + +#define JG jg +#define JLE jle + +#define VLD2560(addr,reg) vmovapd addr,reg +#define VST2560(reg,addr) vmovapd reg,addr +#define VMUL2560(a,b,c) vmulpd a,b,c +#define MVMUL2560(a,b,c) vmulpd b,a,c +#define VADD2560(a,b,c) vaddpd a,b,c +#define MVADD2560(a,b,c) vaddpd b,a,c +#define VSHUF2560(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD2560(addr,reg) vbroadcastsd addr,reg +#define MOVRR2560(a,b) vmovapd a,b +#define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR2561(imm,a,b) vextractf128 imm,a,b +#define LDL2561(addr,reg) vmovlpd addr,reg,reg +#define 
LDH2561(addr,reg) vmovhpd addr,reg,reg +#define STL2561(reg,addr) vmovlpd reg,addr +#define STH2561(reg,addr) vmovhpd reg,addr +#define VADD2561(a,b,c) vaddpd a,b,c +#define VXOR2560(a,b,c) vxorpd a,b,c +#define PREFETCH02560(addr,b) prefetcht0 addr +#define PREFETCH12560(addr,b) prefetcht0 addr +#define PREFETCH22560(addr,b) prefetcht2 addr +#define PREFETCHW2560(addr,b) prefetchw addr +#define PREFETCHN2560(addr,b) prefetchnta addr +#define VMA2560(a,b,c,d) vfmaddpd d,a,b,c +#define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c + +#define VLD1280(addr,reg) vmovapd addr,reg +#define VLD1282(addr,reg) vmovapd addr,reg +#define VLD1281(addr,reg) movsd addr,reg +#define VST1280(reg,addr) vmovapd reg,addr +#define VST1282(reg,addr) vmovapd reg,addr +#define VST1281(reg,addr) movsd reg,addr +#define VLDU1282(addr,reg) vmovupd addr,reg +#define VLDU1281(addr,reg) movsd addr,reg +#define VSTU1282(reg,addr) vmovupd reg,addr +#define VSTU1281(reg,addr) movsd reg,addr +#define VMUL1280(a,b,c) vmulpd a,b,c +#define VMUL1282(a,b,c) vmulpd a,b,c +#define VMUL1281(a,b,c) vmulpd a,b,c +#define MVMUL1280(a,b,c) vmulpd b,a,c +#define VADD1280(a,b,c) vaddpd a,b,c +#define MVADD1280(a,b,c) vaddpd b,a,c +#define VSHUF1280(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD1280(addr,reg) vmovddup addr,reg +#define BROAD1282(addr,reg) vmovddup addr,reg +#define BROAD1281(addr,reg) movddup addr,reg +#define MOVRR1280(a,b) vmovapd a,b +#define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR1281(imm,a,b) vextractf128 imm,a,b +#define LDL1281(addr,reg) vmovlpd addr,reg,reg +#define LDH1281(addr,reg) vmovhpd addr,reg,reg +#define STL1281(reg,addr) vmovlpd reg,addr +#define STH1281(reg,addr) vmovhpd reg,addr +#define VADD1281(a,b,c) vaddpd a,b,c +#define VXOR1280(a,b,c) vxorpd a,b,c +#define VXOR1282(a,b,c) vxorpd a,b,c +#define VXOR1281(a,b,c) vxorpd a,b,c +#define PREFETCH01280(addr,b) prefetcht0 addr +#define PREFETCH11280(addr,b) prefetcht0 
addr +#define PREFETCH21280(addr,b) prefetcht2 addr +#define PREFETCHW1280(addr,b) prefetchw addr +#define PREFETCHN1280(addr,b) prefetchnta addr +#define VMA1280(a,b,c,d) vfmaddpd d,a,b,c +#define VMA1282(a,b,c,d) vfmadd231pd a,b,c +#define VMA1281(a,b,c,d) vfmadd231pd a,b,c +#define VMA21282(a,b,c,d) vfmadd231pd a,b,c +#define VMA21281(a,b,c,d) vfmadd231pd a,b,c +//#define VMA1282(a,b,c,d) nop +//#define VMA1281(a,b,c,d) nop +//#define VMA21282(a,b,c,d) nop +//#define VMA21281(a,b,c,d) nop +#define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c + +#define imm1 $0x05 +#define imm3 $0x05 +#define imm100 $0x05 +#define imm200 $0x0a + +#define XMM0 %xmm0 +#define XMM1 %xmm1 +#define XMM2 %xmm2 +#define XMM3 %xmm3 +#define XMM4 %xmm4 +#define XMM5 %xmm5 +#define XMM6 %xmm6 +#define XMM7 %xmm7 +#define XMM8 %xmm8 +#define XMM9 %xmm9 +#define XMM10 %xmm10 +#define XMM11 %xmm11 +#define XMM12 %xmm12 +#define XMM13 %xmm13 +#define XMM14 %xmm14 +#define XMM15 %xmm15 + +#define YMM0 %ymm0 +#define YMM1 %ymm1 +#define YMM2 %ymm2 +#define YMM3 %ymm3 +#define YMM4 %ymm4 +#define YMM5 %ymm5 +#define YMM6 %ymm6 +#define YMM7 %ymm7 +#define YMM8 %ymm8 +#define YMM9 %ymm9 +#define YMM10 %ymm10 +#define YMM11 %ymm11 +#define YMM12 %ymm12 +#define YMM13 %ymm13 +#define YMM14 %ymm14 +#define YMM15 %ymm15 +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); +vzeroupper +movl old_ldc, %eax +movq %rax, LDC +movlps ALPHA, alpha +movq oldbk_i, _bk_i +movq oldbk_j, _bk_j +movq oldbk_l, _bk_l +leaq (, LDC, SIZE), LDC + +MOVQ1280(_bk_j,j); +SARQ1280($2,j); +JLE ._L_0_loopE; +ALIGN_4; +._L_0_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_1_loopE; +._L_1_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); 
+VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +VXOR1282(XMM8,XMM8,XMM8); +VXOR1282(XMM9,XMM9,XMM9); +VXOR1282(XMM10,XMM10,XMM10); +VXOR1282(XMM11,XMM11,XMM11); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_2_loopE; +ALIGN_4; +._L_2_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH01280(176*SIZE(_ptr__A_0),0); 
+BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_2_bodyE:; +DECQ1280(k); +JG ._L_2_bodyB; +ALIGN_4; +._L_2_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_3_loopE; +ALIGN_4; +._L_3_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); 
+BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_3_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_4_loopE; +ALIGN_4; +._L_4_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_4_loopE:; +BROAD1282(alpha,XMM12); +VLDU1282(0*SIZE(_ptr__C_0),XMM13); +VMA21282(XMM12,XMM0,XMM13,XMM0); 
+VSTU1282(XMM13,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM14); +VMA21282(XMM12,XMM1,XMM14,XMM1); +VSTU1282(XMM14,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM15); +VMA21282(XMM12,XMM2,XMM15,XMM2); +VSTU1282(XMM15,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); +VMA21282(XMM12,XMM3,XMM13,XMM3); +VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); +VMA21282(XMM12,XMM4,XMM14,XMM4); +VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); +VMA21282(XMM12,XMM5,XMM15,XMM5); +VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM12,XMM6,XMM13,XMM6); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM12,XMM7,XMM14,XMM7); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(4*SIZE(_ptr__C_1),XMM15); +VMA21282(XMM12,XMM8,XMM15,XMM8); +VSTU1282(XMM15,4*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); +VMA21282(XMM12,XMM9,XMM13,XMM9); +VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); +VMA21282(XMM12,XMM10,XMM14,XMM10); +VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM12,XMM11,XMM15,XMM11); +VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_1_bodyE:; +SUBQ1280($6,i); +JG ._L_1_bodyB; +ALIGN_4; +._L_1_loopE:; +TESTQ1280($4,i); +JLE ._L_5_loopE; +ALIGN_4; +._L_5_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_6_loopE; +ALIGN_4; +._L_6_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); 
+BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); 
+VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_6_bodyE:; +DECQ1280(k); +JG ._L_6_bodyB; +ALIGN_4; +._L_6_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_7_loopE; +ALIGN_4; +._L_7_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_7_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_8_loopE; +ALIGN_4; +._L_8_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($4*SIZE,_ptr__A_0); 
+ADDQ1280($4*SIZE,_ptr__B_0); +._L_8_loopE:; +BROAD1282(alpha,XMM8); +VLDU1282(0*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM8,XMM0,XMM9,XMM0); +VSTU1282(XMM9,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM10); +VMA21282(XMM8,XMM1,XMM10,XMM1); +VSTU1282(XMM10,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM8,XMM2,XMM11,XMM2); +VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM8,XMM3,XMM12,XMM3); +VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM8,XMM4,XMM13,XMM4); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM8,XMM5,XMM14,XMM5); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM8,XMM6,XMM15,XMM6); +VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); +VMA21282(XMM8,XMM7,XMM9,XMM7); +VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_5_loopE:; +TESTQ1280($2,i); +JLE ._L_9_loopE; +ALIGN_4; +._L_9_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_10_loopE; +ALIGN_4; +._L_10_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); 
+BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_10_bodyE:; +DECQ1280(k); +JG ._L_10_bodyB; +ALIGN_4; +._L_10_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_11_loopE; +ALIGN_4; +._L_11_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_11_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_12_loopE; +ALIGN_4; +._L_12_bodyB:; 
+PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_12_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_9_loopE:; +TESTQ1280($1,i); +JLE ._L_13_loopE; +ALIGN_4; +._L_13_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +VXOR1281(XMM2,XMM2,XMM2); +VXOR1281(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_14_loopE; +ALIGN_4; +._L_14_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); 
+BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(8*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(9*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(10*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(11*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(12*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(13*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(14*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(15*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_14_bodyE:; +DECQ1280(k); +JG ._L_14_bodyB; +ALIGN_4; +._L_14_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_15_loopE; +ALIGN_4; +._L_15_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_15_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_16_loopE; +ALIGN_4; +._L_16_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); 
+VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_16_loopE:; +BROAD1281(alpha,XMM4); +VLDU1281(0*SIZE(_ptr__C_0),XMM5); +VMA21281(XMM4,XMM0,XMM5,XMM0); +VSTU1281(XMM5,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21281(XMM4,XMM1,XMM6,XMM1); +VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1281(0*SIZE(_ptr__C_1),XMM7); +VMA21281(XMM4,XMM2,XMM7,XMM2); +VSTU1281(XMM7,0*SIZE(_ptr__C_1)); +VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21281(XMM4,XMM3,XMM8,XMM3); +VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_13_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($2,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_ptr_B); +._L_0_bodyE:; +DECQ1280(j); +JG ._L_0_bodyB; +ALIGN_4; +._L_0_loopE:; +TESTQ1280($2,_bk_j); +JLE ._L_17_loopE; +ALIGN_4; +._L_17_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_18_loopE; +._L_18_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_19_loopE; +ALIGN_4; +._L_19_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); 
+VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_19_bodyE:; +DECQ1280(k); +JG ._L_19_bodyB; +ALIGN_4; +._L_19_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_20_loopE; +ALIGN_4; +._L_20_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_20_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_21_loopE; +ALIGN_4; +._L_21_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_21_loopE:; +BROAD1282(alpha,XMM6); +VLDU1282(0*SIZE(_ptr__C_0),XMM7); +VMA21282(XMM6,XMM0,XMM7,XMM0); +VSTU1282(XMM7,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM8); +VMA21282(XMM6,XMM1,XMM8,XMM1); +VSTU1282(XMM8,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM6,XMM2,XMM9,XMM2); +VSTU1282(XMM9,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); +VMA21282(XMM6,XMM3,XMM10,XMM3); +VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM6,XMM4,XMM11,XMM4); +VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM6,XMM5,XMM12,XMM5); +VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_18_bodyE:; +SUBQ1280($6,i); +JG 
._L_18_bodyB; +ALIGN_4; +._L_18_loopE:; +TESTQ1280($4,i); +JLE ._L_22_loopE; +ALIGN_4; +._L_22_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_23_loopE; +ALIGN_4; +._L_23_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_23_bodyE:; +DECQ1280(k); +JG ._L_23_bodyB; +ALIGN_4; +._L_23_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_24_loopE; +ALIGN_4; +._L_24_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); 
+BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_24_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_25_loopE; +ALIGN_4; +._L_25_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_25_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_22_loopE:; +TESTQ1280($2,i); +JLE ._L_26_loopE; +ALIGN_4; +._L_26_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); 
+SARQ1280($2,k); +JLE ._L_27_loopE; +ALIGN_4; +._L_27_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_27_bodyE:; +DECQ1280(k); +JG ._L_27_bodyB; +ALIGN_4; +._L_27_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_28_loopE; +ALIGN_4; +._L_28_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_28_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_29_loopE; +ALIGN_4; +._L_29_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_29_loopE:; 
+BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_26_loopE:; +TESTQ1280($1,i); +JLE ._L_30_loopE; +ALIGN_4; +._L_30_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_31_loopE; +ALIGN_4; +._L_31_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_31_bodyE:; +DECQ1280(k); +JG ._L_31_bodyB; +ALIGN_4; +._L_31_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_32_loopE; +ALIGN_4; +._L_32_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); 
+PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_32_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_33_loopE; +ALIGN_4; +._L_33_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_33_loopE:; +BROAD1281(alpha,XMM2); +VLDU1281(0*SIZE(_ptr__C_0),XMM3); +VMA21281(XMM2,XMM0,XMM3,XMM0); +VSTU1281(XMM3,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21281(XMM2,XMM1,XMM4,XMM1); +VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_30_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($1,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_ptr_B); +._L_17_loopE:; +TESTQ1280($1,_bk_j); +JLE ._L_34_loopE; +ALIGN_4; +._L_34_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_35_loopE; +._L_35_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_36_loopE; +ALIGN_4; +._L_36_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_36_bodyE:; +DECQ1280(k); +JG ._L_36_bodyB; +ALIGN_4; +._L_36_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_37_loopE; +ALIGN_4; +._L_37_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_37_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_38_loopE; +ALIGN_4; +._L_38_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); 
+VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_38_loopE:; +BROAD1282(alpha,XMM3); +VLDU1282(0*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM3,XMM0,XMM4,XMM0); +VSTU1282(XMM4,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM3,XMM1,XMM5,XMM1); +VSTU1282(XMM5,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM3,XMM2,XMM6,XMM2); +VSTU1282(XMM6,4*SIZE(_ptr__C_0)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_35_bodyE:; +SUBQ1280($6,i); +JG ._L_35_bodyB; +ALIGN_4; +._L_35_loopE:; +TESTQ1280($4,i); +JLE ._L_39_loopE; +ALIGN_4; +._L_39_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_40_loopE; +ALIGN_4; +._L_40_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_40_bodyE:; +DECQ1280(k); +JG ._L_40_bodyB; +ALIGN_4; +._L_40_loopE:; 
+TESTQ1280($2,_bk_l); +JLE ._L_41_loopE; +ALIGN_4; +._L_41_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_41_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_42_loopE; +ALIGN_4; +._L_42_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_42_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,2*SIZE(_ptr__C_0)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_39_loopE:; +TESTQ1280($2,i); +JLE ._L_43_loopE; +ALIGN_4; +._L_43_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_44_loopE; +ALIGN_4; +._L_44_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_44_bodyE:; +DECQ1280(k); +JG ._L_44_bodyB; +ALIGN_4; +._L_44_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_45_loopE; +ALIGN_4; +._L_45_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_45_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_46_loopE; +ALIGN_4; +._L_46_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_46_loopE:; +BROAD1282(alpha,XMM1); +VLDU1282(0*SIZE(_ptr__C_0),XMM2); +VMA21282(XMM1,XMM0,XMM2,XMM0); +VSTU1282(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_43_loopE:; +TESTQ1280($1,i); +JLE ._L_47_loopE; +ALIGN_4; +._L_47_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_48_loopE; +ALIGN_4; +._L_48_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); 
+VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_48_bodyE:; +DECQ1280(k); +JG ._L_48_bodyB; +ALIGN_4; +._L_48_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_49_loopE; +ALIGN_4; +._L_49_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_49_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_50_loopE; +ALIGN_4; +._L_50_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_50_loopE:; +BROAD1281(alpha,XMM1); +VLDU1281(0*SIZE(_ptr__C_0),XMM2); +VMA21281(XMM1,XMM0,XMM2,XMM0); +VSTU1281(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_47_loopE:; +MOVQ1280(LDC,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_ptr_B); +._L_34_loopE:; +vzeroupper +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S new file mode 100644 index 000000000..8585d45de --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -0,0 +1,4523 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/********************************************************************* +* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 768 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + 
#endif   /* closes a conditional compiled region opened above this chunk */

/* Scratch buffer for the repacked B panel: sized for K<=512 rows of the
   3-wide (and 2/1-wide) B copies.  BUFFER2 lives LB2_OFFSET bytes above
   BUFFER1 inside the same allocation. */
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2

/* Named stack slots (relative to the aligned %rsp set up in the prologue). */
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)

/* Windows requires each 4K stack page to be touched in order when growing
   the stack by more than one page; probe the pages the buffer will use. */
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
        movl    $0,  4096 * 4(%rsp);\
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
        movl    $0,  4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif

/* Fused multiply-add abstraction: BULLDOZER uses 4-operand FMA4 encodings
   (vfmaddpd dst,src1,src2,addend); everything else uses 3-operand FMA3
   (vfmadd231pd: dst += src1*src2).  Both expand to dst += y1*y2. */
#if defined(BULLDOZER)

#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0

#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0

#else

#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0

#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0

#endif

/* Prefetch distances (bytes ahead) for the A, B and C streams. */
#define A_PR1 512
#define B_PR1 256
#define C_PR1 64

/* Zero the 12 accumulators of the 8x3 micro-tile (xmm4..xmm15):
   4 pairs of A rows x 3 B columns, 2 doubles per register. */
.macro INIT8x3
        vxorpd  %xmm4 , %xmm4 , %xmm4
        vxorpd  %xmm5 , %xmm5 , %xmm5
        vxorpd  %xmm6 , %xmm6 , %xmm6
        vxorpd  %xmm7 , %xmm7 , %xmm7
        vxorpd  %xmm8 , %xmm8 , %xmm8
        vxorpd  %xmm9 , %xmm9 , %xmm9
        vxorpd  %xmm10, %xmm10, %xmm10
        vxorpd  %xmm11, %xmm11, %xmm11
        vxorpd  %xmm12, %xmm12, %xmm12
        vxorpd  %xmm13, %xmm13, %xmm13
        vxorpd  %xmm14, %xmm14, %xmm14
        vxorpd  %xmm15, %xmm15, %xmm15
.endm

/* First K-iteration of the software-pipelined 8x3 loop: uses vmulpd to
   initialise the accumulators (no prior INIT needed on this path), advances
   BO by one 3-element B row, and preloads xmm1/xmm2 for KERNEL8x3_M2. */
.macro KERNEL8x3_INIT
        vmovddup        -12 * SIZE(BO), %xmm1
        vmovups         -16 * SIZE(AO), %xmm0
        prefetcht0      A_PR1(AO)
        vmulpd          %xmm1,%xmm0,%xmm4
        vmovddup        -11 * SIZE(BO), %xmm2
        vmulpd          %xmm2,%xmm0,%xmm5
        vmovddup        -10 * SIZE(BO), %xmm3
        vmulpd          %xmm3,%xmm0,%xmm6
        vmovups         -14 * SIZE(AO), %xmm0
        vmulpd          %xmm1,%xmm0,%xmm7
        vmulpd          %xmm2,%xmm0,%xmm8
        vmulpd          %xmm3,%xmm0,%xmm9
        vmovups         -12 * SIZE(AO), %xmm0
        vmulpd          %xmm1,%xmm0,%xmm10
        vmulpd          %xmm2,%xmm0,%xmm11
        addq            $3*SIZE, BO
        vmulpd          %xmm3,%xmm0,%xmm12
        vmovups         -10 * SIZE(AO), %xmm0
        vmulpd          %xmm1,%xmm0,%xmm13
        vmovddup        -12 * SIZE(BO), %xmm1
        vmulpd          %xmm2,%xmm0,%xmm14
        vmovddup        -11 * SIZE(BO), %xmm2
        vmulpd          %xmm3,%xmm0,%xmm15
.endm

/* Pipelined iterations M1..M8 process 8 K-steps with AO/BO advanced only in
   M4/M8 (fixed displacements in between).  Each Mx expects xmm1/xmm2 (and,
   except M1, loads xmm3) with the current B row, and preloads the next B
   row's xmm1/xmm2 late in the body to hide load latency. */
.macro KERNEL8x3_M1
        vmovups         -16 * SIZE(AO), %xmm0
        prefetcht0      A_PR1(AO)
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         -14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         -12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         -10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        -12 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        -11 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

.macro KERNEL8x3_M2
        vmovups         -8 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+64(AO)
        vmovddup        -10 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         -6 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         -4 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         -2 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        -9 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        -8 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

.macro KERNEL8x3_M3
        vmovups         0 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+128(AO)
        vmovddup        -7 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         2 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         4 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         6 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        -6 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        -5 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

/* M4 advances AO by 32 doubles (4 K-steps x 8 rows); displacements in
   M5..M8 are again relative to the new AO. */
.macro KERNEL8x3_M4
        vmovups         8 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+192(AO)
        vmovddup        -4 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        -3 * SIZE(BO), %xmm1
        addq            $32 * SIZE, AO
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        -2 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

.macro KERNEL8x3_M5
        vmovups         -16 * SIZE(AO), %xmm0
        prefetcht0      A_PR1(AO)
        vmovddup        -1 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         -14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         -12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         -10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        0 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        1 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

.macro KERNEL8x3_M6
        vmovups         -8 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+64(AO)
        vmovddup        2 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         -6 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         -4 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         -2 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        3 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        4 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

.macro KERNEL8x3_M7
        vmovups         0 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+128(AO)
        vmovddup        5 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         2 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         4 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         6 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        6 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        7 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

/* M8 closes one 8-step round: advances AO by 32 and BO by 24 doubles and
   preloads xmm1/xmm2/xmm3 for the next round's M1. */
.macro KERNEL8x3_M8
        vmovups         8 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+192(AO)
        vmovddup        8 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        vmovddup        9 * SIZE(BO), %xmm1
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        vmovddup        10 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
        vmovddup        11 * SIZE(BO), %xmm3
        addq            $32 * SIZE, AO
        addq            $24 * SIZE, BO
.endm

/* Pipeline drain: like M8 but does not preload the next B row; BO advances
   by only 21 doubles because this round's first row was consumed by INIT. */
.macro KERNEL8x3_E
        vmovups         8 * SIZE(AO), %xmm0
        prefetcht0      A_PR1+192(AO)
        vmovddup        8 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        addq            $32*SIZE, AO
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        addq            $21*SIZE, BO
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

/* Non-pipelined single K-step (used for the K%8 remainder loop): loads its
   own B values and advances AO/BO by one step. */
.macro KERNEL8x3_SUBN
        vmovddup        -12 * SIZE(BO), %xmm1
        vmovups         -16 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm4 )
        vmovddup        -11 * SIZE(BO), %xmm2
        VFMADD231PD_(   %xmm2,%xmm0,%xmm5 )
        vmovddup        -10 * SIZE(BO), %xmm3
        VFMADD231PD_(   %xmm3,%xmm0,%xmm6 )
        vmovups         -14 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm7 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm8 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm9 )
        vmovups         -12 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm10 )
        VFMADD231PD_(   %xmm2,%xmm0,%xmm11 )
        VFMADD231PD_(   %xmm3,%xmm0,%xmm12 )
        vmovups         -10 * SIZE(AO), %xmm0
        VFMADD231PD_(   %xmm1,%xmm0,%xmm13 )
        addq            $3*SIZE, BO
        VFMADD231PD_(   %xmm2,%xmm0,%xmm14 )
        addq            $8*SIZE, AO
        VFMADD231PD_(   %xmm3,%xmm0,%xmm15 )
.endm

/* C update for the 8x3 tile: C[i,j] = alpha*acc + C[i,j] for 3 columns
   (CO1, CO1+LDC, CO1+2*LDC), then store back and bump CO1 by 8 elements.
   NOTE(review): uses the raw 4-operand FMA4 `vfmaddpd dst,alpha,acc,mem`
   form directly instead of the VFMADD231PD_ selector, so this macro only
   assembles on FMA4-capable (BULLDOZER-family) targets — confirm the FMA3
   (#else) configuration never reaches this code path. */
.macro SAVE8x3
        vmovddup        ALPHA, %xmm0

        vfmaddpd        (CO1),%xmm0, %xmm4,%xmm4
        vfmaddpd        (CO1, LDC),%xmm0, %xmm5,%xmm5
        vfmaddpd        (CO1, LDC, 2),%xmm0, %xmm6,%xmm6

        vfmaddpd        2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
        vfmaddpd        4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
        vfmaddpd        6 * SIZE(CO1),%xmm0, %xmm13,%xmm13

        vfmaddpd        2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
        vfmaddpd        4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
        vfmaddpd        6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14

        vfmaddpd        2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
        vfmaddpd        4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
        vfmaddpd        6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15

        vmovups         %xmm4 , (CO1)
        vmovups         %xmm7 , 2 * SIZE(CO1)
        vmovups         %xmm10, 4 * SIZE(CO1)
        vmovups         %xmm13, 6 * SIZE(CO1)

        vmovups         %xmm5 , (CO1, LDC)
        vmovups         %xmm8 , 2 * SIZE(CO1, LDC)
        vmovups         %xmm11, 4 * SIZE(CO1, LDC)
        vmovups         %xmm14, 6 * SIZE(CO1, LDC)

        vmovups         %xmm6 , (CO1, LDC, 2)
        vmovups         %xmm9 , 2 * SIZE(CO1, LDC, 2)
        vmovups         %xmm12, 4 * SIZE(CO1, LDC, 2)
        vmovups         %xmm15, 6 * SIZE(CO1, LDC, 2)

        prefetcht0      C_PR1(CO1)
        prefetcht0      C_PR1(CO1,LDC)
        prefetcht0      C_PR1(CO1,LDC,2)

        addq            $8 * SIZE, CO1          # coffset += 8
.endm


/*******************************************************************************************/

/* 4x3 tile kernels (CPP macros, FMA4 only — see NOTE on SAVE8x3).
   These index A via (AO,%rax,8) and B via (BO,BI,8) with negated counters,
   so the caller's addq on BI/%rax walks the panels forward. */
#define KERNEL4x3_1(xx) \
        vmovddup        -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vfmaddpd        %xmm9,%xmm3,%xmm0,%xmm9 ;\

#define KERNEL4x3_2(xx) \
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vfmaddpd        %xmm9,%xmm3,%xmm0,%xmm9 ;\

#define KERNEL4x3_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        2 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        vmovups         -6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vfmaddpd        %xmm9,%xmm3,%xmm0,%xmm9 ;\

/* step 4 advances the counters: 4 K-steps = 12 B values, 16 A values */
#define KERNEL4x3_4(xx) \
        vmovddup        3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        4 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        5 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        vmovups         -2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vfmaddpd        %xmm9,%xmm3,%xmm0,%xmm9 ;\
        addq            $12, BI ;\
        addq            $16, %rax ;\

#define KERNEL4x3_SUB(xx) \
        vmovddup        -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vfmaddpd        %xmm9,%xmm3,%xmm0,%xmm9 ;\


/*******************************************************************************************/

/* 2x3 tile kernels: one A pair (xmm0) against 3 B columns -> xmm4/5/6. */
#define KERNEL2x3_1(xx) \
        vmovddup        -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_2(xx) \
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        2 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_4(xx) \
        vmovddup        3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        4 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        5 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        addq            $12, BI ;\
        addq            $8, %rax ;\

#define KERNEL2x3_SUB(xx) \
        vmovddup        -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddpd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

/*******************************************************************************************/

/* 1x3 tile kernels: scalar (vmovsd/vfmaddsd) variant of the 2x3 kernels. */
#define KERNEL1x3_1(xx) \
        vmovsd          -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovsd          -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddsd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_2(xx) \
        vmovsd          -3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -15 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -2 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovsd          -1 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddsd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_3(xx) \
        vmovsd          0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovsd          2 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddsd        %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_4(xx) \
        vmovsd          3 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -13 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          4 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovsd          5 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddsd        %xmm6,%xmm3,%xmm0,%xmm6 ;\
        addq            $12, BI ;\
        addq            $4, %rax ;\

#define KERNEL1x3_SUB(xx) \
        vmovsd          -6 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -5 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovsd          -4 * SIZE(BO, BI, 8), %xmm3 ;\
        vfmaddsd        %xmm6,%xmm3,%xmm0,%xmm6 ;\



/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/

/* 8x2 tile kernels: 4 A pairs against 2 B columns -> xmm4/5,7/8,10/11,13/14. */
#define KERNEL8x2_1(xx) \
        prefetcht0      A_PR1(AO,%rax,8) ;\
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vfmaddpd        %xmm11,%xmm2,%xmm0,%xmm11 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        vfmaddpd        %xmm14,%xmm2,%xmm0,%xmm14 ;\

#define KERNEL8x2_2(xx) \
        prefetcht0      A_PR1+64(AO,%rax,8) ;\
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vmovups         -4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vfmaddpd        %xmm11,%xmm2,%xmm0,%xmm11 ;\
        vmovups         -2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        vfmaddpd        %xmm14,%xmm2,%xmm0,%xmm14 ;\

#define KERNEL8x2_3(xx) \
        prefetcht0      A_PR1+128(AO,%rax,8) ;\
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         0 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vmovups         4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vfmaddpd        %xmm11,%xmm2,%xmm0,%xmm11 ;\
        vmovups         6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        vfmaddpd        %xmm14,%xmm2,%xmm0,%xmm14 ;\

/* step 4 advances the counters: 4 K-steps = 8 B values, 32 A values */
#define KERNEL8x2_4(xx) \
        prefetcht0      A_PR1+192(AO,%rax,8) ;\
        vmovddup        2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vmovups         12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vfmaddpd        %xmm11,%xmm2,%xmm0,%xmm11 ;\
        vmovups         14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        vfmaddpd        %xmm14,%xmm2,%xmm0,%xmm14 ;\
        addq            $8, BI ;\
        addq            $32, %rax ;\

#define KERNEL8x2_SUB(xx) \
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vfmaddpd        %xmm11,%xmm2,%xmm0,%xmm11 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        vfmaddpd        %xmm14,%xmm2,%xmm0,%xmm14 ;\

/*******************************************************************************************/

/* 4x2 tile kernels: 2 A pairs against 2 B columns -> xmm4/5 and xmm7/8.
   FMA4 `vfmaddpd` form (BULLDOZER-family); counters BI/%rax advance in _4. */
#define KERNEL4x2_1(xx) \
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\

#define KERNEL4x2_2(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\

#define KERNEL4x2_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\

/* step 4 advances the counters: 4 K-steps = 8 B values, 16 A values */
#define KERNEL4x2_4(xx) \
        vmovddup        2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\
        addq            $8, BI ;\
        addq            $16, %rax ;\

/* single K-step for the K%4 remainder loop (caller bumps BI/%rax) */
#define KERNEL4x2_SUB(xx) \
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vfmaddpd        %xmm8,%xmm2,%xmm0,%xmm8 ;\


/*******************************************************************************************/

/* 2x2 tile kernels: one A pair against 2 B columns -> xmm4/xmm5. */
#define KERNEL2x2_1(xx) \
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL2x2_2(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL2x2_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

/* step 4 advances the counters: 4 K-steps = 8 B values, 8 A values */
#define KERNEL2x2_4(xx) \
        vmovddup        2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        addq            $8, BI ;\
        addq            $8, %rax ;\

#define KERNEL2x2_SUB(xx) \
        vmovddup        -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovddup        -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddpd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

/*******************************************************************************************/

/* 1x2 tile kernels: scalar variant (one A element, 2 B columns). */
#define KERNEL1x2_1(xx) \
        vmovsd          -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL1x2_2(xx) \
        vmovsd          -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -15 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL1x2_3(xx) \
        vmovsd          0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          1 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\

/* step 4 advances the counters: 4 K-steps = 8 B values, 4 A values */
#define KERNEL1x2_4(xx) \
        vmovsd          2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -13 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\
        addq            $8, BI ;\
        addq            $4, %rax ;\

#define KERNEL1x2_SUB(xx) \
        vmovsd          -4 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovsd          -3 * SIZE(BO, BI, 8), %xmm2 ;\
        vfmaddsd        %xmm5,%xmm2,%xmm0,%xmm5 ;\



/*******************************************************************************************
* 1 line of N
*******************************************************************************************/

/* 8x1 tile kernels: 4 A pairs against a single B column -> xmm4/7/10/13. */
#define KERNEL8x1_1(xx) \
        prefetcht0      A_PR1(AO,%rax,8) ;\
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\

#define KERNEL8x1_2(xx) \
        prefetcht0      A_PR1+64(AO,%rax,8) ;\
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vmovups         -4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vmovups         -2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\

#define KERNEL8x1_3(xx) \
        prefetcht0      A_PR1+128(AO,%rax,8) ;\
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         0 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vmovups         4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vmovups         6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\

/* step 4 of the 8x1 pipeline; advances counters: 4 B values, 32 A values */
#define KERNEL8x1_4(xx) \
        prefetcht0      A_PR1+192(AO,%rax,8) ;\
        vmovddup        1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vmovups         12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vmovups         14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\
        addq            $4, BI ;\
        addq            $32, %rax ;\

/* single 8x1 K-step for the remainder loop (caller bumps BI/%rax) */
#define KERNEL8x1_SUB(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm10,%xmm1,%xmm0,%xmm10 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm13,%xmm1,%xmm0,%xmm13 ;\


/*******************************************************************************************/

/* 4x1 tile kernels: 2 A pairs against one B column -> xmm4/xmm7. */
#define KERNEL4x1_1(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\

#define KERNEL4x1_2(xx) \
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\

#define KERNEL4x1_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -8 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -6 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\

/* step 4 advances the counters: 4 K-steps = 4 B values, 16 A values */
#define KERNEL4x1_4(xx) \
        vmovddup        1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -4 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -2 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\
        addq            $4, BI ;\
        addq            $16, %rax ;\

#define KERNEL4x1_SUB(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm7,%xmm1,%xmm0,%xmm7 ;\


/*******************************************************************************************/

/* 2x1 tile kernels: one A pair against one B column -> xmm4. */
#define KERNEL2x1_1(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL2x1_2(xx) \
        vmovddup        -1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL2x1_3(xx) \
        vmovddup        0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -12 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

/* step 4 advances the counters: 4 K-steps = 4 B values, 8 A values */
#define KERNEL2x1_4(xx) \
        vmovddup        1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -10 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        addq            $4, BI ;\
        addq            $8, %rax ;\

#define KERNEL2x1_SUB(xx) \
        vmovddup        -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovups         -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddpd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

/*******************************************************************************************/

/* 1x1 tile kernels: fully scalar tail case -> xmm4. */
#define KERNEL1x1_1(xx) \
        vmovsd          -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL1x1_2(xx) \
        vmovsd          -1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -15 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL1x1_3(xx) \
        vmovsd          0 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -14 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\

/* step 4 advances the counters: 4 K-steps = 4 B values, 4 A values */
#define KERNEL1x1_4(xx) \
        vmovsd          1 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -13 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\
        addq            $4, BI ;\
        addq            $4, %rax ;\

#define KERNEL1x1_SUB(xx) \
        vmovsd          -2 * SIZE(BO, BI, 8), %xmm1 ;\
        vmovsd          -16 * SIZE(AO, %rax, 8), %xmm0 ;\
        vfmaddsd        %xmm4,%xmm1,%xmm0,%xmm4 ;\


+/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 
6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + prefetcht0 B_PR1(BO) + prefetcht0 
B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L6_12 + +.L6_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + 
KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + 
KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 
2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L7_12 + +.L7_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 + + decq I # i -- + 
jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je 
.L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + 
KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + 
+ KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + 
movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + 
+/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), 
BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq 
SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 
+ + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + 
KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, 
%rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, 
CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + 
+#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + 
+ vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + 
je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq 
KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 
208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S new file mode 100644 index 000000000..dcfed6bc5 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -0,0 +1,5258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + 
STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 
* SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, 
SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + 
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 
* SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * 
SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), 
%xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + 
vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd 
%xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq 
BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, 
BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + 
KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq 
(BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + 
salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + 
leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) 
+ KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, 
%rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + 
KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + 
prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + 
KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups 
%xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall 
+ + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // 
rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 
208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + 
+.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq 
%rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz 
.L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq 
(AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + 
movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + 
addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index 
for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, 
J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, 
%rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + 
subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, 
%rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + 
+ KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, 
KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + 
movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S new file mode 100644 index 000000000..2f1434ffa --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -0,0 +1,3159 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/********************************************************************* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 86 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 157 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 235 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 288 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 
+#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $4 , BI + addq $16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + 
+#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $4 , BI + addq $8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $4 , BI + addq $4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , 
%xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $4 , BI + addq $2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddps (CO2), %xmm8,%xmm8 + vaddps 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddps (CO2, LDC), %xmm10,%xmm10 + vaddps 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + 
/*******************************************************************************************/

// KERNEL1x4_SUB: one k-step of the 1x4 micro-tile.
// One scalar of A against four B values; four scalar FMAs into
// xmm4 / xmm6 / xmm8 / xmm10.
.macro KERNEL1x4_SUB
	vmovss	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(	%xmm6,%xmm3,%xmm0 )
	vmovss	 -2 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -1 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(	%xmm8,%xmm2,%xmm0 )
	VFMADD231SS_(	%xmm10,%xmm3,%xmm0 )
	addq	$4 , BI
	addq	$1, %rax
.endm

// SAVE1x4: C[0, 0..3] = alpha * acc (plus old C in the GEMM case).
// Accumulators are scalars, so the C accumulation must be scalar too.
// FIX: vaddss replaces vaddps - the packed form loaded 16 bytes per C
// address although only one float is valid, reading past the end of the
// C buffer (possible fault at the matrix edge).
.macro SAVE1x4

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm8 , %xmm8
	vmulss	%xmm0 , %xmm10, %xmm10


#if !defined(TRMMKERNEL)

	vaddss	(CO1), %xmm4,%xmm4
	vaddss	(CO1, LDC), %xmm6,%xmm6
	vaddss	(CO2), %xmm8,%xmm8
	vaddss	(CO2, LDC), %xmm10,%xmm10

#endif

	vmovss	%xmm4 , (CO1)
	vmovss	%xmm6 , (CO1, LDC)
	vmovss	%xmm8 , (CO2)
	vmovss	%xmm10, (CO2, LDC)

.endm


/*******************************************************************************************/

/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/

// KERNEL16x2_SUB: one k-step of the 16x2 micro-tile.
// Two ymm rows of A (16 floats) against two broadcast B values.
.macro KERNEL16x2_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vmovups	 -8 * SIZE(AO, %rax, SIZE), %ymm1
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	-3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(	%ymm5,%ymm2,%ymm1 )
	VFMADD231PS_(	%ymm6,%ymm3,%ymm0 )
	VFMADD231PS_(	%ymm7,%ymm3,%ymm1 )
	addq	$2 , BI
	addq	$16, %rax
.endm

// SAVE16x2: write the 16x2 tile. Accumulators are full ymm vectors, so the
// packed adds read exactly the 8 floats that are stored back per register.
.macro SAVE16x2

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm7 , %ymm7


#if !defined(TRMMKERNEL)

	vaddps	(CO1), %ymm4,%ymm4
	vaddps	8 * SIZE(CO1), %ymm5,%ymm5

	vaddps	(CO1, LDC), %ymm6,%ymm6
	vaddps	8 * SIZE(CO1, LDC), %ymm7,%ymm7

#endif

	vmovups	%ymm4 , (CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

	vmovups	%ymm6 , (CO1, LDC)
	vmovups	%ymm7 , 8 * SIZE(CO1, LDC)

.endm


/*******************************************************************************************/

// KERNEL8x2_SUB: one k-step of the 8x2 micro-tile (one ymm row of A).
.macro KERNEL8x2_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	-3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(	%ymm6,%ymm3,%ymm0 )
	addq	$2 , BI
	addq	$8 , %rax
.endm

// SAVE8x2: write the 8x2 tile; full-width ymm accumulation is valid here.
.macro SAVE8x2

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm6 , %ymm6

#if !defined(TRMMKERNEL)

	vaddps	(CO1), %ymm4,%ymm4
	vaddps	(CO1, LDC), %ymm6,%ymm6

#endif

	vmovups	%ymm4 , (CO1)
	vmovups	%ymm6 , (CO1, LDC)

.endm


/*******************************************************************************************/

// KERNEL4x2_SUB: one k-step of the 4x2 micro-tile (one xmm row of A).
.macro KERNEL4x2_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm2
	vbroadcastss	-3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231PS_(	%xmm4,%xmm2,%xmm0 )
	VFMADD231PS_(	%xmm6,%xmm3,%xmm0 )
	addq	$2 , BI
	addq	$4 , %rax
.endm

// SAVE4x2: write the 4x2 tile; the xmm accumulators hold 4 packed floats
// (vmovups store), so 16-byte packed adds read exactly the valid C data.
.macro SAVE4x2

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4
	vmulps	%xmm0 , %xmm6 , %xmm6

#if !defined(TRMMKERNEL)

	vaddps	(CO1), %xmm4,%xmm4
	vaddps	(CO1, LDC), %xmm6,%xmm6

#endif

	vmovups	%xmm4 , (CO1)
	vmovups	%xmm6 , (CO1, LDC)

.endm


/*******************************************************************************************/

// KERNEL2x2_SUB: one k-step of the 2x2 micro-tile (all scalar).
.macro KERNEL2x2_SUB
	vmovss	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	-15 * SIZE(AO, %rax, SIZE), %xmm1
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(	%xmm5,%xmm2,%xmm1 )
	VFMADD231SS_(	%xmm6,%xmm3,%xmm0 )
	VFMADD231SS_(	%xmm7,%xmm3,%xmm1 )
	addq	$2 , BI
	addq	$2, %rax
.endm

// SAVE2x2: write the 2x2 tile of scalar accumulators.
// FIX: vaddss replaces vaddps (each packed add over-read 16 bytes of C
// where only one float is valid - see SAVE1x4).
.macro SAVE2x2

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm7 , %xmm7


#if !defined(TRMMKERNEL)

	vaddss	(CO1), %xmm4,%xmm4
	vaddss	1 * SIZE(CO1), %xmm5,%xmm5

	vaddss	(CO1, LDC), %xmm6,%xmm6
	vaddss	1 * SIZE(CO1, LDC), %xmm7,%xmm7

#endif

	vmovss	%xmm4 , (CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

	vmovss	%xmm6 , (CO1, LDC)
	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)

.endm


/*******************************************************************************************/

// KERNEL1x2_SUB: one k-step of the 1x2 micro-tile (all scalar).
.macro KERNEL1x2_SUB
	vmovss	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(	%xmm6,%xmm3,%xmm0 )
	addq	$2 , BI
	addq	$1, %rax
.endm

// SAVE1x2: write the 1x2 tile of scalar accumulators.
// FIX: vaddss replaces vaddps (16-byte over-read of C, see SAVE1x4).
.macro SAVE1x2

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm6 , %xmm6

#if !defined(TRMMKERNEL)

	vaddss	(CO1), %xmm4,%xmm4
	vaddss	(CO1, LDC), %xmm6,%xmm6

#endif

	vmovss	%xmm4 , (CO1)
	vmovss	%xmm6 , (CO1, LDC)

.endm


/*******************************************************************************************/

/*******************************************************************************************
* 1 line of N
*******************************************************************************************/

// KERNEL16x1_SUB: one k-step of the 16x1 micro-tile (two ymm rows of A
// against one broadcast B value).
.macro KERNEL16x1_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vmovups	 -8 * SIZE(AO, %rax, SIZE), %ymm1
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %ymm2
	VFMADD231PS_(	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(	%ymm5,%ymm2,%ymm1 )
	addq	$1 , BI
	addq	$16, %rax
.endm

// SAVE16x1: write the 16x1 tile; full ymm accumulation is valid here.
.macro SAVE16x1

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5

#if !defined(TRMMKERNEL)

	vaddps	(CO1), %ymm4,%ymm4
	vaddps	8 * SIZE(CO1), %ymm5,%ymm5

#endif

	vmovups	%ymm4 , (CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

.endm


/*******************************************************************************************/

// KERNEL8x1_SUB: one k-step of the 8x1 micro-tile.
.macro KERNEL8x1_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %ymm2
	VFMADD231PS_(	%ymm4,%ymm2,%ymm0 )
	addq	$1 , BI
	addq	$8 , %rax
.endm

// SAVE8x1: write the 8x1 tile; full ymm accumulation is valid here.
.macro SAVE8x1

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4

#if !defined(TRMMKERNEL)

	vaddps	(CO1), %ymm4,%ymm4

#endif

	vmovups	%ymm4 , (CO1)

.endm


/*******************************************************************************************/

// KERNEL4x1_SUB: one k-step of the 4x1 micro-tile (one xmm row of A).
.macro KERNEL4x1_SUB
	vmovups	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vbroadcastss	-4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231PS_(	%xmm4,%xmm2,%xmm0 )
	addq	$1 , BI
	addq	$4 , %rax
.endm

// SAVE4x1: write the 4x1 tile; the xmm accumulator holds 4 packed floats
// (vmovups store), so the 16-byte packed add reads exactly the valid C data.
.macro SAVE4x1

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4

#if !defined(TRMMKERNEL)

	vaddps	(CO1), %xmm4,%xmm4

#endif

	vmovups	%xmm4 , (CO1)

.endm


/*******************************************************************************************/

// KERNEL2x1_SUB: one k-step of the 2x1 micro-tile (all scalar).
.macro KERNEL2x1_SUB
	vmovss	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	-15 * SIZE(AO, %rax, SIZE), %xmm1
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231SS_(	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(	%xmm5,%xmm2,%xmm1 )
	addq	$1 , BI
	addq	$2, %rax
.endm

// SAVE2x1: write the 2x1 tile of scalar accumulators.
// FIX: vaddss replaces vaddps (16-byte over-read of C, see SAVE1x4).
.macro SAVE2x1

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5

#if !defined(TRMMKERNEL)

	vaddss	(CO1), %xmm4,%xmm4
	vaddss	1 * SIZE(CO1), %xmm5,%xmm5

#endif

	vmovss	%xmm4 , (CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

.endm


/*******************************************************************************************/

// KERNEL1x1_SUB: one k-step of the 1x1 micro-tile (single scalar FMA).
.macro KERNEL1x1_SUB
	vmovss	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231SS_(	%xmm4,%xmm2,%xmm0 )
	addq	$1 , BI
	addq	$1, %rax
.endm

// SAVE1x1: write a single C element.
// FIX: vaddss replaces vaddps (16-byte over-read of C, see SAVE1x4).
.macro SAVE1x1

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4

#if !defined(TRMMKERNEL)

	vaddss	(CO1), %xmm4,%xmm4

#endif

	vmovss	%xmm4 , (CO1)

.endm

+/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + 
prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $16*SIZE,BO1 + addq $16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq 
I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, 
BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + 
addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + 
testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + 
+ KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + 
KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq 
%rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + 
vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB 
+ KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A 
+#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + 
leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + 
movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + 
movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index f56490ee3..5083d0b01 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bb40ac41e..a8bbb1cad 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 653a96ff8..47af7726a 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index e26088c16..57d8c2a20 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if 
defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S new file mode 100644 index 000000000..9f1392d78 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -0,0 +1,1428 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/30 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 
+ STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 
;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), 
%xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) 
\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + 
vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + 
KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, 
%xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + 
salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + 
defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq 
%rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef 
TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg 
.L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + 
movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + 
addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S new file mode 100644 index 000000000..1e6278466 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -0,0 +1,1812 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 4608x4608x4608: +* 1 thread: 43 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 85 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 122 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 156 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 
3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + 
vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $4, BI + addq $8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm11, %ymm11, %ymm11 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $0x05, %ymm10, %ymm10, %ymm11 + vshufpd $0x05, %ymm12, %ymm12, %ymm13 + vshufpd $0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm11, %ymm11, %ymm11 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + vshufpd $0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, 
%ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $4, BI + addq $4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + 
vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $4, BI + addq $2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd 
%xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $2, BI + addq $8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + 
vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $2, BI + addq $4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm 
+ + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef 
TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + 
+#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + 
leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), 
BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + 
prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + 
+ movq M, I + sarq $2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + 
KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq 
%rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 7f3b54ddb..204e5e6ab 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 852a81820..5769d242a 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index c79e7f1a2..6f782b1e2 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if 
defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 6d0afce18..f92779e24 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/lapack/laswp/arm/Makefile b/lapack/laswp/arm/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/lapack/laswp/arm64/Makefile b/lapack/laswp/arm64/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 0c3df6951..e4b095a84 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef HASWELL + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define 
ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM @@ -1793,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 
+#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + + + #ifdef GENERIC #define SNUMOPT 2