Merge bulldozer, haswell, piledriver and armv7 branches (tags/v0.2.9.rc1)
| @@ -0,0 +1,12 @@ | |||||
| # CORE=ARMV7: append identical code-generation flags (-marm, VFPv3 FPU, hard-float ABI, armv7-a) to CCOMMON_OPT and FCOMMON_OPT (presumably the C and Fortran flag lists); keep the two lines in sync. | |||||
| ifeq ($(CORE), ARMV7) | |||||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| endif | |||||
| # CORE=ARMV6: same scheme as other cores, but with the plain VFP FPU (-mfpu=vfp) and -march=armv6; both option lists receive identical flags. | |||||
| ifeq ($(CORE), ARMV6) | |||||
| CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 | |||||
| FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 | |||||
| endif | |||||
| @@ -0,0 +1,7 @@ | |||||
| # CORE=ARMV8: only the architecture level (-march=armv8-a) is added, identically to CCOMMON_OPT and FCOMMON_OPT. | |||||
| ifeq ($(CORE), ARMV8) | |||||
| CCOMMON_OPT += -march=armv8-a | |||||
| FCOMMON_OPT += -march=armv8-a | |||||
| endif | |||||
| @@ -336,14 +336,14 @@ ifeq ($(ARCH), x86) | |||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -373,6 +373,19 @@ NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| # ARCH=arm and ARCH=arm64 both set NO_BINARY_MODE and BINARY_DEFINED to 1. | |||||
| # NOTE(review): presumably this marks the 32/64-bit binary mode as already decided so that no mode flag is chosen later; confirm against the code that reads these variables. | |||||
| ifeq ($(ARCH), arm) | |||||
| NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | |||||
| endif | |||||
| ifeq ($(ARCH), arm64) | |||||
| NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | |||||
| endif | |||||
| # | # | ||||
| # C Compiler dependent settings | # C Compiler dependent settings | ||||
| # | # | ||||
| @@ -833,6 +846,19 @@ ifeq ($(DEBUG), 1) | |||||
| COMMON_OPT += -g | COMMON_OPT += -g | ||||
| endif | endif | ||||
| # Default optimisation level for ARCH=arm / ARCH=arm64 is -O3. | |||||
| # It is applied only while COMMON_OPT is still undefined, so any value set earlier (by the user, or apparently by the DEBUG=1 branch that appends -g) wins. | |||||
| ifndef COMMON_OPT | |||||
| ifeq ($(ARCH), arm) | |||||
| COMMON_OPT = -O3 | |||||
| endif | |||||
| endif | |||||
| ifndef COMMON_OPT | |||||
| ifeq ($(ARCH), arm64) | |||||
| COMMON_OPT = -O3 | |||||
| endif | |||||
| endif | |||||
| ifndef COMMON_OPT | ifndef COMMON_OPT | ||||
| COMMON_OPT = -O2 | COMMON_OPT = -O2 | ||||
| endif | endif | ||||
| @@ -958,6 +984,10 @@ export HAVE_SSE4_2 | |||||
| export HAVE_SSE4A | export HAVE_SSE4A | ||||
| export HAVE_SSE5 | export HAVE_SSE5 | ||||
| export HAVE_AVX | export HAVE_AVX | ||||
| export HAVE_VFP | |||||
| export HAVE_VFPV3 | |||||
| export HAVE_VFPV4 | |||||
| export HAVE_NEON | |||||
| export KERNELDIR | export KERNELDIR | ||||
| export FUNCTION_PROFILE | export FUNCTION_PROFILE | ||||
| export TARGET_CORE | export TARGET_CORE | ||||
| @@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | $architecture = alpha if ($data =~ /ARCH_ALPHA/); | ||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | $architecture = sparc if ($data =~ /ARCH_SPARC/); | ||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | $architecture = ia64 if ($data =~ /ARCH_IA64/); | ||||
| # /ARCH_ARM/ is unanchored and therefore also matches "ARCH_ARM64"; the arm64 test must stay after the arm test so that it overrides the value. | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $defined = 0; | $defined = 0; | ||||
| @@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | $architecture = alpha if ($data =~ /ARCH_ALPHA/); | ||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | $architecture = sparc if ($data =~ /ARCH_SPARC/); | ||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | $architecture = ia64 if ($data =~ /ARCH_IA64/); | ||||
| # /ARCH_ARM/ is unanchored and therefore also matches "ARCH_ARM64"; the arm64 test must stay after the arm test so that it overrides the value. | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $binformat = bin32; | $binformat = bin32; | ||||
| $binformat = bin64 if ($data =~ /BINARY_64/); | $binformat = bin64 if ($data =~ /BINARY_64/); | ||||
| @@ -0,0 +1,303 @@ | |||||
| #ifndef CBLAS_H | |||||
| #define CBLAS_H | |||||
| #include <stddef.h> | |||||
| #include "common.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| /* Assume C declarations for C++ */ | |||||
| #endif /* __cplusplus */ | |||||
| /* Set the number of threads at runtime (two entry points taking the same argument). */ | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| void goto_set_num_threads(int num_threads); | |||||
| /* Get the build configuration at runtime, as a string. */ | |||||
| char* openblas_get_config(void); | |||||
| /* Get the parallelization type used by OpenBLAS (presumably one of the OPENBLAS_* constants below). */ | |||||
| int openblas_get_parallel(void); | |||||
| /* OpenBLAS is compiled for sequential use. */ | |||||
| #define OPENBLAS_SEQUENTIAL 0 | |||||
| /* OpenBLAS is compiled using the normal threading model. */ | |||||
| #define OPENBLAS_THREAD 1 | |||||
| /* OpenBLAS is compiled using the OpenMP threading model. */ | |||||
| #define OPENBLAS_OPENMP 2 | |||||
| /* CBLAS_INDEX is the return type of the cblas_i?amax routines declared in this header; it is size_t, hence the <stddef.h> include. */ | |||||
| #define CBLAS_INDEX size_t | |||||
| /* Argument enumerations for the cblas_* prototypes; every enumerator has an explicitly assigned value. */ | |||||
| typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | |||||
| typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; | |||||
| typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | |||||
| typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | |||||
| typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | |||||
| float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); | |||||
| double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); | |||||
| void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); | |||||
| void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | |||||
| void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | |||||
| float cblas_sasum (blasint n, float *x, blasint incx); | |||||
| double cblas_dasum (blasint n, double *x, blasint incx); | |||||
| float cblas_scasum(blasint n, float *x, blasint incx); | |||||
| double cblas_dzasum(blasint n, double *x, blasint incx); | |||||
| float cblas_snrm2 (blasint N, float *X, blasint incX); | |||||
| double cblas_dnrm2 (blasint N, double *X, blasint incX); | |||||
| float cblas_scnrm2(blasint N, float *X, blasint incX); | |||||
| double cblas_dznrm2(blasint N, double *X, blasint incX); | |||||
| CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); | |||||
| CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); | |||||
| CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); | |||||
| CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); | |||||
| void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); | |||||
| void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); | |||||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||||
| void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); | |||||
| void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); | |||||
| void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); | |||||
| void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); | |||||
| void cblas_sscal(blasint N, float alpha, float *X, blasint incX); | |||||
| void cblas_dscal(blasint N, double alpha, double *X, blasint incX); | |||||
| void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); | |||||
| void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); | |||||
| void cblas_csscal(blasint N, float alpha, float *X, blasint incX); | |||||
| void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); | |||||
| void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); | |||||
| void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); | |||||
| void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); | |||||
| void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); | |||||
| void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||||
| void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||||
| void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||||
| void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||||
| void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, | |||||
| blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, | |||||
| blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, | |||||
| float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, | |||||
| double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, | |||||
| float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, | |||||
| double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); | |||||
| void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); | |||||
| void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); | |||||
| void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); | |||||
| void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); | |||||
| void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); | |||||
| void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); | |||||
| void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); | |||||
| void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||||
| float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||||
| double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||||
| void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||||
| void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||||
| void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||||
| void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif /* __cplusplus */ | |||||
| #endif | |||||
| @@ -310,6 +310,15 @@ typedef int blasint; | |||||
| #define YIELDING SwitchToThread() | #define YIELDING SwitchToThread() | ||||
| #endif | #endif | ||||
| #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) | |||||
| /* On ARM cores YIELDING executes eight no-ops instead of falling through to sched_yield(). | |||||
| Spelled __asm__ __volatile__ rather than asm volatile so the macro also compiles under strict ISO modes (-std=c99, -std=c11, -ansi), matching the PILEDRIVER definition. | |||||
| The expansion itself ends in ';', exactly as before. */ | |||||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); | |||||
| #endif | |||||
| #ifdef PILEDRIVER | |||||
| /* PILEDRIVER builds yield by executing eight no-ops; the expansion itself ends in ';'. | |||||
| NOTE(review): this #define is unconditional. If an earlier branch (e.g. the SwitchToThread() one) has already defined YIELDING, the macro is redefined here; confirm that combination cannot occur. */ | |||||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||||
| #endif | |||||
| #ifndef YIELDING | #ifndef YIELDING | ||||
| #define YIELDING sched_yield() | #define YIELDING sched_yield() | ||||
| #endif | #endif | ||||
| @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_mips64.h" | #include "common_mips64.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_ARM | |||||
| #include "common_arm.h" | |||||
| #endif | |||||
| #ifdef ARCH_ARM64 | |||||
| #include "common_arm64.h" | |||||
| #endif | |||||
| #ifdef OS_LINUX | #ifdef OS_LINUX | ||||
| #include "common_linux.h" | #include "common_linux.h" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,169 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #ifndef COMMON_ARM | |||||
| #define COMMON_ARM | |||||
| #define MB | |||||
| #define WMB | |||||
| #define INLINE inline | |||||
| #define RETURN_BY_COMPLEX | |||||
| #ifndef ASSEMBLER | |||||
/*
 * Spin-style lock acquire on the word at `address`.
 *
 * Each attempt first busy-waits (executing YIELDING) until *address reads
 * as zero, then runs an ldrex/strex sequence on the same word.  The outer
 * loop repeats while `ret` is non-zero; `ret` is copied from r3, the status
 * register operand of strex (per the ARM ISA, 0 means the store succeeded).
 *
 * NOTE(review): r2 is overwritten with #0 right before the strex, so the
 * value stored appears to be 0 rather than a non-zero "locked" marker --
 * confirm whether the lock word is ever actually set by this routine.
 * NOTE(review): `address` is declared as an output operand tied to the
 * input through the "1" constraint -- confirm this is intended.
 */
| static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| int register ret; | |||||
| do { | |||||
/* Wait until the lock word reads as zero before attempting the exclusive access. */
| while (*address) {YIELDING;}; | |||||
| __asm__ __volatile__( | |||||
| "ldrex r2, [%1] \n\t" | |||||
| "mov r2, #0 \n\t" | |||||
| "strex r3, r2, [%1] \n\t" | |||||
| "mov %0 , r3 \n\t" | |||||
| : "=r"(ret), "=r"(address) | |||||
| : "1"(address) | |||||
| : "memory", "r2" , "r3" | |||||
| ); | |||||
/* Retry the whole sequence while the exclusive store reported a non-zero status. */
| } while (ret); | |||||
| } | |||||
/*
 * Coarse timestamp helper.
 *
 * Reads the wall clock with gettimeofday() and returns it expressed in
 * whole milliseconds (seconds * 1000, fractional part truncated).
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;

  /* Plain double constant: the former "1000.0d" spelling relies on a GNU
     literal suffix that strict ISO C compilers (and clang) reject. */
  return (unsigned long long) (seconds * 1000.0);
}
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
| #if defined(DOUBLE) | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") | |||||
| #else | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") | |||||
| #endif | |||||
| #define GET_IMAGE_CANCEL | |||||
| #endif | |||||
| #ifndef F_INTERFACE | |||||
| #define REALNAME ASMNAME | |||||
| #else | |||||
| #define REALNAME ASMFNAME | |||||
| #endif | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||||
| #define PROLOGUE \ | |||||
| .arm ;\ | |||||
| .global REALNAME ;\ | |||||
| .func REALNAME ;\ | |||||
| REALNAME: | |||||
| #define EPILOGUE | |||||
| #define PROFCODE | |||||
| #endif | |||||
| #define SEEK_ADDRESS | |||||
| #ifndef PAGESIZE | |||||
| #define PAGESIZE ( 4 << 10) | |||||
| #endif | |||||
| #define HUGE_PAGESIZE ( 4 << 20) | |||||
| #define BUFFER_SIZE (16 << 20) | |||||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||||
| #ifndef MAP_ANONYMOUS | |||||
| #define MAP_ANONYMOUS MAP_ANON | |||||
| #endif | |||||
| #endif | |||||
| @@ -0,0 +1,169 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #ifndef COMMON_ARM64 | |||||
| #define COMMON_ARM64 | |||||
| #define MB | |||||
| #define WMB | |||||
| #define INLINE inline | |||||
| #define RETURN_BY_COMPLEX | |||||
| #ifndef ASSEMBLER | |||||
/*
 * Lock-acquire stub for the ARM64 build.
 *
 * The entire body is commented out, so calling this function currently does
 * nothing with `address`.  The disabled code is an ldrex/strex sequence that
 * uses the r2/r3 registers.
 *
 * NOTE(review): as written, callers get no mutual exclusion from this
 * routine -- confirm whether a real AArch64 implementation is expected here.
 */
| static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| /* | |||||
| int register ret; | |||||
| do { | |||||
| while (*address) {YIELDING;}; | |||||
| __asm__ __volatile__( | |||||
| "ldrex r2, [%1] \n\t" | |||||
| "mov r2, #0 \n\t" | |||||
| "strex r3, r2, [%1] \n\t" | |||||
| "mov %0 , r3 \n\t" | |||||
| : "=r"(ret), "=r"(address) | |||||
| : "1"(address) | |||||
| : "memory", "r2" , "r3" | |||||
| ); | |||||
| } while (ret); | |||||
| */ | |||||
| } | |||||
/*
 * Coarse timestamp helper.
 *
 * Reads the wall clock with gettimeofday() and returns it expressed in
 * whole milliseconds (seconds * 1000, fractional part truncated).
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;

  /* Plain double constant: the former "1000.0d" spelling relies on a GNU
     literal suffix that strict ISO C compilers (and clang) reject. */
  return (unsigned long long) (seconds * 1000.0);
}
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
| #if defined(DOUBLE) | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") | |||||
| #else | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") | |||||
| #endif | |||||
| #define GET_IMAGE_CANCEL | |||||
| #endif | |||||
| #ifndef F_INTERFACE | |||||
| #define REALNAME ASMNAME | |||||
| #else | |||||
| #define REALNAME ASMFNAME | |||||
| #endif | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||||
| #define PROLOGUE \ | |||||
| .arm ;\ | |||||
| .global REALNAME ;\ | |||||
| .func REALNAME ;\ | |||||
| REALNAME: | |||||
| #define EPILOGUE | |||||
| #define PROFCODE | |||||
| #endif | |||||
| #define SEEK_ADDRESS | |||||
| #ifndef PAGESIZE | |||||
| #define PAGESIZE ( 4 << 10) | |||||
| #endif | |||||
| #define HUGE_PAGESIZE ( 4 << 20) | |||||
| #define BUFFER_SIZE (16 << 20) | |||||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||||
| #ifndef MAP_ANONYMOUS | |||||
| #define MAP_ANONYMOUS MAP_ANON | |||||
| #endif | |||||
| #endif | |||||
| @@ -107,7 +107,7 @@ | |||||
| #define CORE_BOBCAT 21 | #define CORE_BOBCAT 21 | ||||
| #define CORE_BULLDOZER 22 | #define CORE_BULLDOZER 22 | ||||
| #define CORE_PILEDRIVER 23 | #define CORE_PILEDRIVER 23 | ||||
| #define CORE_HASWELL CORE_SANDYBRIDGE | |||||
| #define CORE_HASWELL 24 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -200,7 +200,6 @@ typedef struct { | |||||
| #define CPUTYPE_BOBCAT 45 | #define CPUTYPE_BOBCAT 45 | ||||
| #define CPUTYPE_BULLDOZER 46 | #define CPUTYPE_BULLDOZER 46 | ||||
| #define CPUTYPE_PILEDRIVER 47 | #define CPUTYPE_PILEDRIVER 47 | ||||
| // this define is because BLAS doesn't have haswell specific optimizations yet | |||||
| #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | |||||
| #define CPUTYPE_HASWELL 48 | |||||
| #endif | #endif | ||||
| @@ -0,0 +1,262 @@ | |||||
| /************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <string.h> | |||||
| #define CPU_UNKNOWN 0 | |||||
| #define CPU_ARMV6 1 | |||||
| #define CPU_ARMV7 2 | |||||
| #define CPU_CORTEXA15 3 | |||||
/*
 * Printable core names, indexed by the CPU_* identifiers defined above
 * (CPU_UNKNOWN = 0 ... CPU_CORTEXA15 = 3).  Entry 0 is spelled "UNKNOWN"
 * so that it agrees with the string printed by get_subarchitecture().
 */
static char *cpuname[] = {
	"UNKNOWN",
	"ARMV6",
	"ARMV7",
	"CORTEXA15"
};
/*
 * Report whether the "Features" line of /proc/cpuinfo lists `search`.
 *
 * search: feature name to look for (compared with strcmp, so the match is
 *         exact and case-sensitive).
 *
 * Returns 1 when the feature is listed, 0 otherwise.  0 is also returned
 * when the code is not built for Linux, when /proc/cpuinfo cannot be
 * opened, or when no "Features" line is found.
 */
int get_feature(char *search)
{
#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = (char *) NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return(0);

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the ':' only when one is really there. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return(0);

	/*
	 * Examine every token, including the first one, and treat the
	 * trailing newline as a separator so the last feature can match.
	 */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, search)) return(1);
	}
#endif
	return(0);
}
| int detect(void) | |||||
| { | |||||
| #ifdef linux | |||||
| FILE *infile; | |||||
| char buffer[512], *p; | |||||
| p = (char *) NULL ; | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) | |||||
| { | |||||
| if (!strncmp("model name", buffer, 10)) | |||||
| { | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL) | |||||
| { | |||||
| if (strstr(p, "ARMv7")) | |||||
| { | |||||
| if ( get_feature("vfpv4")) | |||||
| return CPU_ARMV7; | |||||
| if ( get_feature("vfpv3")) | |||||
| return CPU_ARMV7; | |||||
| if ( get_feature("vfp")) | |||||
| return CPU_ARMV6; | |||||
| } | |||||
| if (strstr(p, "ARMv6")) | |||||
| { | |||||
| if ( get_feature("vfp")) | |||||
| return CPU_ARMV6; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| return CPU_UNKNOWN; | |||||
| } | |||||
| char *get_corename(void) | |||||
| { | |||||
| return cpuname[detect()]; | |||||
| } | |||||
/* Write the architecture name ("ARM", no trailing newline) to stdout. */
void get_architecture(void)
{
	fputs("ARM", stdout);
}
| void get_subarchitecture(void) | |||||
| { | |||||
| int d = detect(); | |||||
| switch (d) | |||||
| { | |||||
| case CPU_ARMV7: | |||||
| printf("ARMV7"); | |||||
| break; | |||||
| case CPU_ARMV6: | |||||
| printf("ARMV6"); | |||||
| break; | |||||
| default: | |||||
| printf("UNKNOWN"); | |||||
| break; | |||||
| } | |||||
| } | |||||
/* Write the kernel sub-directory name ("arm", no trailing newline) to stdout. */
void get_subdirname(void)
{
	fputs("arm", stdout);
}
| void get_cpuconfig(void) | |||||
| { | |||||
| int d = detect(); | |||||
| switch (d) | |||||
| { | |||||
| case CPU_ARMV7: | |||||
| printf("#define ARMV7\n"); | |||||
| printf("#define HAVE_VFP\n"); | |||||
| printf("#define HAVE_VFPV3\n"); | |||||
| if ( get_feature("neon")) printf("#define HAVE_NEON\n"); | |||||
| if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| break; | |||||
| case CPU_ARMV6: | |||||
| printf("#define ARMV6\n"); | |||||
| printf("#define HAVE_VFP\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| break; | |||||
| } | |||||
| } | |||||
| void get_libname(void) | |||||
| { | |||||
| int d = detect(); | |||||
| switch (d) | |||||
| { | |||||
| case CPU_ARMV7: | |||||
| printf("armv7\n"); | |||||
| break; | |||||
| case CPU_ARMV6: | |||||
| printf("armv6\n"); | |||||
| break; | |||||
| } | |||||
| } | |||||
/*
 * Print one "HAVE_xxx=1" line to stdout for each of vfp, vfpv3, vfpv4 and
 * neon that appears on the "Features" line of /proc/cpuinfo.
 *
 * Prints nothing when the code is not built for Linux, when /proc/cpuinfo
 * cannot be opened, or when no "Features" line is found.
 */
void get_features(void)
{
#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = (char *) NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the ':' only when one is really there. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return;

	/*
	 * Examine every token, including the first one, and treat the
	 * trailing newline as a separator so the last feature can match.
	 */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, "vfp"))   { printf("HAVE_VFP=1\n");   continue; }
		if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
		if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
		if (!strcmp(t, "neon"))  { printf("HAVE_NEON=1\n");  continue; }
	}
#endif
	return;
}
| @@ -1243,6 +1243,7 @@ static char *cpuname[] = { | |||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | "PILEDRIVER", | ||||
| "HASWELL", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1293,6 +1294,7 @@ static char *lowercpuname[] = { | |||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | "piledriver", | ||||
| "haswell", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1320,6 +1322,7 @@ static char *corename[] = { | |||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | "PILEDRIVER", | ||||
| "HASWELL", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1347,6 +1350,7 @@ static char *corename_lower[] = { | |||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | "piledriver", | ||||
| "haswell", | |||||
| }; | }; | ||||
| @@ -124,3 +124,12 @@ ARCH_IA64 | |||||
| #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) | #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) | ||||
| BINARY_64 | BINARY_64 | ||||
| #endif | #endif | ||||
| #if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) | |||||
| ARCH_ARM | |||||
| #endif | |||||
| #if defined(__aarch64__) | |||||
| ARCH_ARM64 | |||||
| #endif | |||||
| @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||
| min_jj = min_j + js - jjs; | min_jj = min_j + js - jjs; | ||||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||||
| else | |||||
| #if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | ||||
| else | else | ||||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | ||||
| @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | ||||
| min_jj = MIN(n_to, xxx + div_n) - jjs; | min_jj = MIN(n_to, xxx + div_n) - jjs; | ||||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||||
| else | |||||
| #if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | ||||
| else | else | ||||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | ||||
| @@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT; | |||||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | extern gotoblas_t gotoblas_SANDYBRIDGE; | ||||
| extern gotoblas_t gotoblas_BULLDOZER; | extern gotoblas_t gotoblas_BULLDOZER; | ||||
| extern gotoblas_t gotoblas_PILEDRIVER; | extern gotoblas_t gotoblas_PILEDRIVER; | ||||
| extern gotoblas_t gotoblas_HASWELL; | |||||
| #else | #else | ||||
| //Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | #define gotoblas_BULLDOZER gotoblas_BARCELONA | ||||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | #define gotoblas_PILEDRIVER gotoblas_BARCELONA | ||||
| #endif | #endif | ||||
| //Use sandy bridge kernels for haswell. | |||||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||||
| #define VENDOR_INTEL 1 | #define VENDOR_INTEL 1 | ||||
| #define VENDOR_AMD 2 | #define VENDOR_AMD 2 | ||||
| @@ -297,6 +298,7 @@ static char *corename[] = { | |||||
| "Bobcat", | "Bobcat", | ||||
| "Bulldozer", | "Bulldozer", | ||||
| "Piledriver", | "Piledriver", | ||||
| "Haswell", | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -319,7 +321,8 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | ||||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | ||||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "SANDYBRIDGE" | #define CORENAME "SANDYBRIDGE" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_HASWELL | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "HASWELL" | |||||
| #define ARCHCONFIG "-DHASWELL " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||||
| "-DFMA3" | |||||
| #define LIBNAME "haswell" | |||||
| #define CORENAME "HASWELL" | |||||
| #endif | |||||
| #ifdef FORCE_ATOM | #ifdef FORCE_ATOM | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -679,6 +694,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "generic" | #define CORENAME "generic" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_ARMV7 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM" | |||||
| #define SUBARCHITECTURE "ARMV7" | |||||
| #define SUBDIRNAME "arm" | |||||
| #define ARCHCONFIG "-DARMV7 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ | |||||
| "-DHAVE_VFPV3 -DHAVE_VFP" | |||||
| #define LIBNAME "armv7" | |||||
| #define CORENAME "ARMV7" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_ARMV6 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM" | |||||
| #define SUBARCHITECTURE "ARMV6" | |||||
| #define SUBDIRNAME "arm" | |||||
| #define ARCHCONFIG "-DARMV6 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ | |||||
| "-DHAVE_VFP" | |||||
| #define LIBNAME "armv6" | |||||
| #define CORENAME "ARMV6" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_ARMV8 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "ARMV8" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DARMV8 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ | |||||
| "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" | |||||
| #define LIBNAME "armv8" | |||||
| #define CORENAME "ARMV8" | |||||
| #else | |||||
| #endif | |||||
| #ifndef FORCE | #ifndef FORCE | ||||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | ||||
| @@ -719,6 +780,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| #endif | #endif | ||||
| #ifdef __arm__ | |||||
| #include "cpuid_arm.c" | |||||
| #define OPENBLAS_SUPPORTED | |||||
| #endif | |||||
| #ifndef OPENBLAS_SUPPORTED | #ifndef OPENBLAS_SUPPORTED | ||||
| #error "This arch/CPU is not supported by OpenBLAS." | #error "This arch/CPU is not supported by OpenBLAS." | ||||
| #endif | #endif | ||||
| @@ -773,7 +840,7 @@ int main(int argc, char *argv[]){ | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("CORE=%s\n", CORENAME); | printf("CORE=%s\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) | |||||
| #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) | |||||
| printf("CORE=%s\n", get_corename()); | printf("CORE=%s\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -788,6 +855,11 @@ int main(int argc, char *argv[]){ | |||||
| printf("NUM_CORES=%d\n", get_num_cores()); | printf("NUM_CORES=%d\n", get_num_cores()); | ||||
| #if defined(__arm__) && !defined(FORCE) | |||||
| get_features(); | |||||
| #endif | |||||
| #if defined(__i386__) || defined(__x86_64__) | #if defined(__i386__) || defined(__x86_64__) | ||||
| #ifndef FORCE | #ifndef FORCE | ||||
| get_sse(); | get_sse(); | ||||
| @@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS) | |||||
| USE_GEMM3M = 1 | USE_GEMM3M = 1 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), arm) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| ifeq ($(ARCH), arm64) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| ifeq ($(TARGET), LOONGSON3B) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
| sgemm_kernel$(TSUFFIX).$(SUFFIX) \ | sgemm_kernel$(TSUFFIX).$(SUFFIX) \ | ||||
| $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ | $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ | ||||
| @@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD | |||||
| $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) | $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) | ||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ | ||||
| ifeq ($(TARGET), LOONGSON3B) | |||||
| ifdef USE_TRMM | |||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| @@ -582,24 +597,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| else | |||||
| ifdef STRMMKERNEL | |||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| else | else | ||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| @@ -613,93 +610,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| endif | |||||
| ifdef DTRMMKERNEL | |||||
| ifdef DTRMMKERNEL_LN | |||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| endif | |||||
| ifdef DTRMMKERNEL_LT | |||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| endif | |||||
| ifdef DTRMMKERNEL_RN | |||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| endif | |||||
| ifdef DTRMMKERNEL_RT | |||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| endif | |||||
| else | |||||
| ifdef DTRMMKERNEL_LN | |||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| endif | |||||
| ifdef DTRMMKERNEL_LT | |||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| endif | |||||
| ifdef DTRMMKERNEL_RN | |||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| endif | |||||
| ifdef DTRMMKERNEL_RT | |||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| endif | |||||
| endif | |||||
| ifdef QTRMMKERNEL | |||||
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | |||||
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | |||||
| else | |||||
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| @@ -713,36 +634,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||||
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| endif | |||||
| ifdef CTRMMKERNEL | |||||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||||
| else | |||||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| @@ -767,37 +658,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | |||||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| endif | |||||
| ifdef ZTRMMKERNEL | |||||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||||
| else | |||||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| @@ -821,37 +681,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| endif | endif | ||||
| endif | |||||
| ifdef XTRMMKERNEL | |||||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||||
| $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | |||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||||
| else | |||||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| @@ -877,9 +710,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | |||||
| $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | ||||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| endif | |||||
| $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) | $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) | ||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ | ||||
| @@ -0,0 +1,46 @@ | |||||
# Fallback kernel selections. Every variable below is assigned only when it
# has not been defined already (ifndef), so an earlier definition wins.
#
# NRM2: single and double precision share nrm2.c; both complex precisions
# share znrm2.c.
| ifndef SNRM2KERNEL | |||||
| SNRM2KERNEL = nrm2.c | |||||
| endif | |||||
| ifndef DNRM2KERNEL | |||||
| DNRM2KERNEL = nrm2.c | |||||
| endif | |||||
| ifndef CNRM2KERNEL | |||||
| CNRM2KERNEL = znrm2.c | |||||
| endif | |||||
| ifndef ZNRM2KERNEL | |||||
| ZNRM2KERNEL = znrm2.c | |||||
| endif | |||||
# CABS: the S, D and Q variants all point at the same source in ../generic.
| ifndef SCABS_KERNEL | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef DCABS_KERNEL | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef QCABS_KERNEL | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
# LSAME: taken from ../generic.
| ifndef LSAME_KERNEL | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| endif | |||||
# GEMM beta: real precisions use gemm_beta.c, complex precisions use
# zgemm_beta.c, both from ../generic.
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| @@ -0,0 +1,142 @@ | |||||
# AMAX / AMIN / MAX / MIN and their index-returning (I*) counterparts: every
# precision and every variant is mapped to the single source iamax_vfp.S.
# NOTE(review): the variant is presumably selected inside that file by
# preprocessor defines supplied by the build rules -- confirm.
| SAMAXKERNEL = iamax_vfp.S | |||||
| DAMAXKERNEL = iamax_vfp.S | |||||
| CAMAXKERNEL = iamax_vfp.S | |||||
| ZAMAXKERNEL = iamax_vfp.S | |||||
| SAMINKERNEL = iamax_vfp.S | |||||
| DAMINKERNEL = iamax_vfp.S | |||||
| CAMINKERNEL = iamax_vfp.S | |||||
| ZAMINKERNEL = iamax_vfp.S | |||||
| SMAXKERNEL = iamax_vfp.S | |||||
| DMAXKERNEL = iamax_vfp.S | |||||
| SMINKERNEL = iamax_vfp.S | |||||
| DMINKERNEL = iamax_vfp.S | |||||
| ISAMAXKERNEL = iamax_vfp.S | |||||
| IDAMAXKERNEL = iamax_vfp.S | |||||
| ICAMAXKERNEL = iamax_vfp.S | |||||
| IZAMAXKERNEL = iamax_vfp.S | |||||
| ISAMINKERNEL = iamax_vfp.S | |||||
| IDAMINKERNEL = iamax_vfp.S | |||||
| ICAMINKERNEL = iamax_vfp.S | |||||
| IZAMINKERNEL = iamax_vfp.S | |||||
| ISMAXKERNEL = iamax_vfp.S | |||||
| IDMAXKERNEL = iamax_vfp.S | |||||
| ISMINKERNEL = iamax_vfp.S | |||||
| IDMINKERNEL = iamax_vfp.S | |||||
# Vector kernels (ASUM, AXPY, COPY, DOT, NRM2, ROT, SCAL, SWAP).
# ASUM, AXPY, NRM2, ROT, SCAL and SWAP use one shared source for all four
# precisions; COPY and DOT have a separate source per precision.
| SASUMKERNEL = asum_vfp.S | |||||
| DASUMKERNEL = asum_vfp.S | |||||
| CASUMKERNEL = asum_vfp.S | |||||
| ZASUMKERNEL = asum_vfp.S | |||||
| SAXPYKERNEL = axpy_vfp.S | |||||
| DAXPYKERNEL = axpy_vfp.S | |||||
| CAXPYKERNEL = axpy_vfp.S | |||||
| ZAXPYKERNEL = axpy_vfp.S | |||||
| SCOPYKERNEL = scopy_vfp.S | |||||
| DCOPYKERNEL = dcopy_vfp.S | |||||
| CCOPYKERNEL = ccopy_vfp.S | |||||
| ZCOPYKERNEL = zcopy_vfp.S | |||||
| SDOTKERNEL = sdot_vfp.S | |||||
| DDOTKERNEL = ddot_vfp.S | |||||
| CDOTKERNEL = cdot_vfp.S | |||||
| ZDOTKERNEL = zdot_vfp.S | |||||
| SNRM2KERNEL = nrm2_vfp.S | |||||
| DNRM2KERNEL = nrm2_vfp.S | |||||
| CNRM2KERNEL = nrm2_vfp.S | |||||
| ZNRM2KERNEL = nrm2_vfp.S | |||||
| SROTKERNEL = rot_vfp.S | |||||
| DROTKERNEL = rot_vfp.S | |||||
| CROTKERNEL = rot_vfp.S | |||||
| ZROTKERNEL = rot_vfp.S | |||||
| SSCALKERNEL = scal_vfp.S | |||||
| DSCALKERNEL = scal_vfp.S | |||||
| CSCALKERNEL = scal_vfp.S | |||||
| ZSCALKERNEL = scal_vfp.S | |||||
| SSWAPKERNEL = swap_vfp.S | |||||
| DSWAPKERNEL = swap_vfp.S | |||||
| CSWAPKERNEL = swap_vfp.S | |||||
| ZSWAPKERNEL = swap_vfp.S | |||||
# GEMV kernels, N and T variants. The real precisions (S, D) share one source
# per variant; the complex precisions (C, Z) each have their own source.
| SGEMVNKERNEL = gemv_n_vfp.S | |||||
| DGEMVNKERNEL = gemv_n_vfp.S | |||||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||||
| SGEMVTKERNEL = gemv_t_vfp.S | |||||
| DGEMVTKERNEL = gemv_t_vfp.S | |||||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||||
# TRMM and GEMM kernels with their copy routines.
# Real precisions use 4x2 kernels, complex precisions use 2x2 kernels (per the
# file names). S and D define all four copy sources (INCOPY/ITCOPY/ONCOPY/
# OTCOPY) and matching *OBJ names; C and Z define only ONCOPY/OTCOPY.
# The S/D OTCOPY source is the generic C file ../generic/gemm_tcopy_2.c.
# NOTE(review): the *COPY sources are presumably the operand packing routines
# for the matching GEMM kernel -- confirm against the build rules.
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||||
| SGEMMKERNEL = sgemm_kernel_4x2_vfp.S | |||||
| SGEMMINCOPY = sgemm_ncopy_4_vfp.S | |||||
| SGEMMITCOPY = sgemm_tcopy_4_vfp.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPY = sgemm_ncopy_2_vfp.S | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_4x2_vfp.S | |||||
| DGEMMINCOPY = dgemm_ncopy_4_vfp.S | |||||
| DGEMMITCOPY = dgemm_tcopy_4_vfp.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPY = dgemm_ncopy_2_vfp.S | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = cgemm_kernel_2x2_vfp.S | |||||
| CGEMMONCOPY = cgemm_ncopy_2_vfp.S | |||||
| CGEMMOTCOPY = cgemm_tcopy_2_vfp.S | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S | |||||
| ZGEMMONCOPY = zgemm_ncopy_2_vfp.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
# TRSM kernels: no assembly here. All four precisions use the same four
# generic C sources (LN, LT, RN, RT) from ../generic.
# NOTE(review): precision is presumably chosen by compile-time defines -- confirm.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -0,0 +1,141 @@ | |||||
# AMAX / AMIN / MAX / MIN and their index-returning (I*) counterparts: every
# precision and every variant is mapped to the single source iamax_vfp.S.
# NOTE(review): the variant is presumably selected inside that file by
# preprocessor defines supplied by the build rules -- confirm.
| SAMAXKERNEL = iamax_vfp.S | |||||
| DAMAXKERNEL = iamax_vfp.S | |||||
| CAMAXKERNEL = iamax_vfp.S | |||||
| ZAMAXKERNEL = iamax_vfp.S | |||||
| SAMINKERNEL = iamax_vfp.S | |||||
| DAMINKERNEL = iamax_vfp.S | |||||
| CAMINKERNEL = iamax_vfp.S | |||||
| ZAMINKERNEL = iamax_vfp.S | |||||
| SMAXKERNEL = iamax_vfp.S | |||||
| DMAXKERNEL = iamax_vfp.S | |||||
| SMINKERNEL = iamax_vfp.S | |||||
| DMINKERNEL = iamax_vfp.S | |||||
| ISAMAXKERNEL = iamax_vfp.S | |||||
| IDAMAXKERNEL = iamax_vfp.S | |||||
| ICAMAXKERNEL = iamax_vfp.S | |||||
| IZAMAXKERNEL = iamax_vfp.S | |||||
| ISAMINKERNEL = iamax_vfp.S | |||||
| IDAMINKERNEL = iamax_vfp.S | |||||
| ICAMINKERNEL = iamax_vfp.S | |||||
| IZAMINKERNEL = iamax_vfp.S | |||||
| ISMAXKERNEL = iamax_vfp.S | |||||
| IDMAXKERNEL = iamax_vfp.S | |||||
| ISMINKERNEL = iamax_vfp.S | |||||
| IDMINKERNEL = iamax_vfp.S | |||||
# Vector kernels (SWAP, ASUM, AXPY, COPY, DOT, NRM2, ROT, SCAL).
# SWAP, ASUM, AXPY, ROT and SCAL use one shared *_vfp.S source for all four
# precisions; COPY and DOT have a separate source per precision.
# NRM2 is the only entry in this group that points at a *_vfpv3.S source.
| SSWAPKERNEL = swap_vfp.S | |||||
| DSWAPKERNEL = swap_vfp.S | |||||
| CSWAPKERNEL = swap_vfp.S | |||||
| ZSWAPKERNEL = swap_vfp.S | |||||
| SASUMKERNEL = asum_vfp.S | |||||
| DASUMKERNEL = asum_vfp.S | |||||
| CASUMKERNEL = asum_vfp.S | |||||
| ZASUMKERNEL = asum_vfp.S | |||||
| SAXPYKERNEL = axpy_vfp.S | |||||
| DAXPYKERNEL = axpy_vfp.S | |||||
| CAXPYKERNEL = axpy_vfp.S | |||||
| ZAXPYKERNEL = axpy_vfp.S | |||||
| SCOPYKERNEL = scopy_vfp.S | |||||
| DCOPYKERNEL = dcopy_vfp.S | |||||
| CCOPYKERNEL = ccopy_vfp.S | |||||
| ZCOPYKERNEL = zcopy_vfp.S | |||||
| SDOTKERNEL = sdot_vfp.S | |||||
| DDOTKERNEL = ddot_vfp.S | |||||
| CDOTKERNEL = cdot_vfp.S | |||||
| ZDOTKERNEL = zdot_vfp.S | |||||
| SNRM2KERNEL = nrm2_vfpv3.S | |||||
| DNRM2KERNEL = nrm2_vfpv3.S | |||||
| CNRM2KERNEL = nrm2_vfpv3.S | |||||
| ZNRM2KERNEL = nrm2_vfpv3.S | |||||
| SROTKERNEL = rot_vfp.S | |||||
| DROTKERNEL = rot_vfp.S | |||||
| CROTKERNEL = rot_vfp.S | |||||
| ZROTKERNEL = rot_vfp.S | |||||
| SSCALKERNEL = scal_vfp.S | |||||
| DSCALKERNEL = scal_vfp.S | |||||
| CSCALKERNEL = scal_vfp.S | |||||
| ZSCALKERNEL = scal_vfp.S | |||||
# GEMV kernels, N and T variants. The real precisions (S, D) share one source
# per variant; the complex precisions (C, Z) each have their own source.
| SGEMVNKERNEL = gemv_n_vfp.S | |||||
| DGEMVNKERNEL = gemv_n_vfp.S | |||||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||||
| SGEMVTKERNEL = gemv_t_vfp.S | |||||
| DGEMVTKERNEL = gemv_t_vfp.S | |||||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||||
# TRMM and GEMM kernels with their copy routines.
# Real precisions use 4x4 *_vfpv3.S kernels, complex precisions use 2x2
# *_vfpv3.S kernels (per the file names); the copy routines are *_vfp.S files.
# For S and D, INCOPY/ITCOPY and their *OBJ variables are deliberately assigned
# empty values; only ONCOPY/OTCOPY are populated. C and Z define only
# ONCOPY/OTCOPY.
# NOTE(review): the empty INCOPY/ITCOPY presumably mean the same packing
# routine serves both operands for the square 4x4 kernel -- confirm.
# The commented-out SGEMMKERNEL line below is an inactive alternative that
# names a generic C kernel.
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||||
| #SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = sgemm_ncopy_4_vfp.S | |||||
| SGEMMOTCOPY = sgemm_tcopy_4_vfp.S | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = dgemm_ncopy_4_vfp.S | |||||
| DGEMMOTCOPY = dgemm_tcopy_4_vfp.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S | |||||
| CGEMMONCOPY = cgemm_ncopy_2_vfp.S | |||||
| CGEMMOTCOPY = cgemm_tcopy_2_vfp.S | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S | |||||
| ZGEMMONCOPY = zgemm_ncopy_2_vfp.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
# TRSM kernels: no assembly here. All four precisions use the same four
# generic C sources (LN, LT, RN, RT) from ../generic.
# NOTE(review): precision is presumably chosen by compile-time defines -- confirm.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -0,0 +1,2 @@ | |||||
| clean :: | |||||
| @@ -0,0 +1,73 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT maxf=0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(maxf); | |||||
| maxf=ABS(x[0]); | |||||
| while(i < n) | |||||
| { | |||||
| if( ABS(x[ix]) > ABS(maxf) ) | |||||
| { | |||||
| maxf = ABS(x[ix]); | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(maxf); | |||||
| } | |||||
| @@ -0,0 +1,73 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(minf); | |||||
| minf=ABS(x[0]); | |||||
| while(i < n) | |||||
| { | |||||
| if( ABS(x[ix]) < ABS(minf) ) | |||||
| { | |||||
| minf = ABS(x[ix]); | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(minf); | |||||
| } | |||||
| @@ -0,0 +1,67 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT sumf = 0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(sumf); | |||||
| n *= inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,481 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/11 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| // Configuration and register aliases for the VFP ASUM kernel. | |||||
| // ASSEMBLER is defined before common.h is included. | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // STACKSIZE is not referenced by the code below: this kernel never adjusts sp. | |||||
| #define STACKSIZE 256 | |||||
| // Inputs are read from r0-r2 as found on entry | |||||
| // (presumably n, x, inc_x in argument order - confirm against the caller). | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| // Loop counter. | |||||
| #define I r12 | |||||
| // Byte offset ahead of X used by the pld prefetch hints. | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // KERNEL_* macros: add absolute values read through X to the running sums in | |||||
| // d0/d1 (DOUBLE) or s0/s1 (single precision). | |||||
| // KERNEL_F4 / KERNEL_F1 read contiguous data with post-incrementing loads; | |||||
| // KERNEL_S4 / KERNEL_S1 load without writeback and then add INC_X to X unscaled, | |||||
| // so INC_X must already be a byte stride when they are expanded. | |||||
| // F4 alternates between the two accumulators; F1, S4 and S1 use only d0 / s0. | |||||
| // ---- real (non-COMPLEX) variants ---- | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| // Real double: prefetch, then four contiguous doubles (d4-d7). | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| fldmiad X!, { d6 - d7 } | |||||
| vabs.f64 d6, d6 | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vabs.f64 d7, d7 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| .endm | |||||
| // Real double: one contiguous element. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| .endm | |||||
| // Real double: four strided elements, X advanced by INC_X after each. | |||||
| .macro KERNEL_S4 | |||||
| fldmiad X, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| // Real double: one strided element. | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| // Real single: four contiguous floats (s4-s7). No pld here; the unit-stride loop | |||||
| // issues the prefetch itself for this variant before expanding the macro. | |||||
| .macro KERNEL_F4 | |||||
| fldmias X!, { s4 - s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| fldmias X!, { s6 - s7 } | |||||
| vabs.f32 s6, s6 | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vabs.f32 s7, s7 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| .endm | |||||
| // Real single: one contiguous element. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| .endm | |||||
| // Real single: four strided elements, X advanced by INC_X after each. | |||||
| .macro KERNEL_S4 | |||||
| fldmias X, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| // Real single: one strided element. | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| // ---- COMPLEX variants: each element is a pair of values and both are summed ---- | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| // Complex double: two identical halves of four doubles each, i.e. eight contiguous | |||||
| // doubles (four pairs) per expansion, with a prefetch before each half. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| fldmiad X!, { d6 - d7 } | |||||
| vabs.f64 d6, d6 | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vabs.f64 d7, d7 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| fldmiad X!, { d6 - d7 } | |||||
| vabs.f64 d6, d6 | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vabs.f64 d7, d7 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| .endm | |||||
| // Complex double: one contiguous pair, loaded as two single-register loads. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| fldmiad X!, { d4 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| .endm | |||||
| // Complex double: four strided pairs (d4, d5), X advanced by INC_X after each pair. | |||||
| .macro KERNEL_S4 | |||||
| fldmiad X, { d4 -d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 -d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 -d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| fldmiad X, { d4 -d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| // Complex double: one strided pair. | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 -d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| // Complex single: eight contiguous floats (four pairs) per expansion; | |||||
| // a single prefetch is issued at the start. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmias X!, { s4 - s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| fldmias X!, { s6 - s7 } | |||||
| vabs.f32 s6, s6 | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vabs.f32 s7, s7 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| fldmias X!, { s4 - s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| fldmias X!, { s6 - s7 } | |||||
| vabs.f32 s6, s6 | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vabs.f32 s7, s7 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| .endm | |||||
| // Complex single: one contiguous pair, loaded as two single-register loads. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| fldmias X!, { s4 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| .endm | |||||
| // Complex single: four strided pairs (s4, s5), X advanced by INC_X after each pair. | |||||
| .macro KERNEL_S4 | |||||
| fldmias X, { s4 -s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 -s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 -s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| fldmias X, { s4 -s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| // Complex single: one strided pair. | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 -s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // ASUM kernel entry point. | |||||
| // Sums the absolute values of N elements read through X (for COMPLEX builds, both | |||||
| // values of every element) and leaves the result in d0 (DOUBLE) or s0 (single). | |||||
| // INC_X == 1 takes the contiguous asum_kernel_F* path, any other non-zero stride the | |||||
| // asum_kernel_S* path. The result is 0 when N <= 0 or INC_X == 0. | |||||
| // Clobbers r12 (I), X, INC_X, and the VFP registers used by the KERNEL_* macros. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Clear both accumulators from an integer zero. Subtracting a register from itself | |||||
| // is not a safe way to clear it: the result is NaN whenever the register holds NaN | |||||
| // or an infinity, and nothing initialises the accumulators before this point. | |||||
| mov r12, #0 | |||||
| vmov s0, r12 | |||||
| vmov s1, r12 | |||||
| #if defined(DOUBLE) | |||||
| // s0/s1 form d0, so d0 already holds all-zero bits; the conversions make that | |||||
| // explicit and also zero d1 (s1 still reads 0.0 after d0 is rewritten). | |||||
| vcvt.f64.f32 d0, s0 | |||||
| vcvt.f64.f32 d1, s1 | |||||
| #endif | |||||
| cmp N, #0 | |||||
| ble asum_kernel_L999 // nothing to sum | |||||
| cmp INC_X, #0 | |||||
| beq asum_kernel_L999 // zero stride: return 0 | |||||
| cmp INC_X, #1 | |||||
| bne asum_kernel_S_BEGIN // non-unit stride | |||||
| // ---- contiguous path ---- | |||||
| asum_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble asum_kernel_F1 | |||||
| .align 5 | |||||
| // Main loop, unrolled twice: each KERNEL_F4 consumes four elements. | |||||
| asum_kernel_F4: | |||||
| #if !defined(DOUBLE) && !defined(COMPLEX) | |||||
| // The real single-precision KERNEL_F4 has no prefetch of its own. | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble asum_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F4 | |||||
| // Tail: the remaining N mod 4 elements, one at a time. | |||||
| asum_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F10 | |||||
| b asum_kernel_L999 | |||||
| // ---- strided path: convert INC_X from elements to bytes first ---- | |||||
| asum_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble asum_kernel_S1 | |||||
| .align 5 | |||||
| asum_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S4 | |||||
| // Tail: the remaining N mod 4 elements, one at a time. | |||||
| asum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S10 | |||||
| // Fold the second accumulator into the first and return. | |||||
| asum_kernel_L999: | |||||
| #if defined(DOUBLE) | |||||
| vadd.f64 d0 , d0, d1 // set return value | |||||
| #else | |||||
| vadd.f32 s0 , s0, s1 // set return value | |||||
| #endif | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,64 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
| * Reference AXPY kernel: y[k*inc_y] += da * x[k*inc_x] for k = 0 .. n-1. | |||||
| * Does nothing when n is negative or da is exactly zero. | |||||
| * dummy0, dummy1, dummy and dummy2 are accepted but never read. | |||||
| * Always returns 0. | |||||
| */ | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
|     BLASLONG k; | |||||
|     BLASLONG src = 0; | |||||
|     BLASLONG dst = 0; | |||||
|     if (n < 0 || da == 0.0) { | |||||
|         return 0; | |||||
|     } | |||||
|     /* src and dst walk x and y independently, each by its own stride. */ | |||||
|     for (k = 0; k < n; k++) { | |||||
|         y[dst] += da * x[src]; | |||||
|         src += inc_x; | |||||
|         dst += inc_y; | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,503 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/14 Saar | |||||
| * BLASTEST : xOK | |||||
| * CTEST : xOK | |||||
| * TEST : xOK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| // Configuration, stack-slot and register aliases for the VFP AXPY kernel. | |||||
| // ASSEMBLER is defined before common.h is included. | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of stack reserved by the prologue. | |||||
| #define STACKSIZE 256 | |||||
| // Stack slots read by the prologue. fp is set to sp + 8 right after two registers | |||||
| // are pushed, so [fp, #0] is the first word above the pushed registers. | |||||
| #define OLD_INC_X [fp, #0 ] | |||||
| #define OLD_Y [fp, #4 ] | |||||
| #define OLD_INC_Y [fp, #8 ] | |||||
| // N and X are used as found in r0 and r3 on entry; Y, INC_X and INC_Y are loaded | |||||
| // from the stack slots above, overwriting whatever r1, r2 and r4 held. | |||||
| #define N r0 | |||||
| #define Y r1 | |||||
| #define INC_X r2 | |||||
| #define X r3 | |||||
| #define INC_Y r4 | |||||
| // Loop counter. | |||||
| #define I r12 | |||||
| // Byte offset ahead of X and Y used by the pld prefetch hints. | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| /*****************************************************************************************/ | |||||
| // FMAC_R1/R2 update the first value of a complex pair and FMAC_I1/I2 the second | |||||
| // (see the COMPLEX KERNEL_* macros). fmac accumulates the product, fnmac subtracts it. | |||||
| // Without CONJ only R2 subtracts; with CONJ only I1 subtracts. | |||||
| #if !defined(CONJ) | |||||
| #if defined(DOUBLE) | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fnmacd | |||||
| #define FMAC_I1 fmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #else | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_I1 fmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if defined(DOUBLE) | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fmacd | |||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #else | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fmacs | |||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #endif | |||||
| #endif | |||||
| // KERNEL_* macros: update Y in place with Y + scalar * X. | |||||
| // The scalar is read from d0 / s0 (real) or from d0,d1 / s0,s1 (COMPLEX) and is never written. | |||||
| // KERNEL_F4 / KERNEL_F1 handle contiguous data: X is loaded with post-increment, Y is loaded | |||||
| // without writeback and advanced by the post-incrementing stores. | |||||
| // KERNEL_S1 handles one strided element and adds INC_X / INC_Y to the pointers unscaled, | |||||
| // so both must already be byte strides when it is expanded. | |||||
| // ---- real (non-COMPLEX) variants ---- | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| // Real double: four contiguous elements, x in d4-d7 and y in d8-d11, with prefetch of both streams. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d7 } | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad Y , { d8 - d11 } | |||||
| fmacd d8 , d0, d4 | |||||
| fstmiad Y!, { d8 } | |||||
| fmacd d9 , d0, d5 | |||||
| fstmiad Y!, { d9 } | |||||
| fmacd d10, d0, d6 | |||||
| fstmiad Y!, { d10 } | |||||
| fmacd d11, d0, d7 | |||||
| fstmiad Y!, { d11 } | |||||
| .endm | |||||
| // Real double: one contiguous element. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| fldmiad Y , { d8 } | |||||
| fmacd d8 , d0, d4 | |||||
| fstmiad Y!, { d8 } | |||||
| .endm | |||||
| // Real double: one strided element. | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X , { d4 } | |||||
| fldmiad Y , { d8 } | |||||
| fmacd d8 , d0, d4 | |||||
| fstmiad Y , { d8 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| // Real single: four contiguous elements, x in s4-s7 and y in s8-s11. No pld here; the | |||||
| // unit-stride loop issues the prefetches itself for this variant. | |||||
| .macro KERNEL_F4 | |||||
| fldmias X!, { s4 - s7 } | |||||
| fldmias Y , { s8 - s11 } | |||||
| fmacs s8 , s0, s4 | |||||
| fstmias Y!, { s8 } | |||||
| fmacs s9 , s0, s5 | |||||
| fstmias Y!, { s9 } | |||||
| fmacs s10, s0, s6 | |||||
| fstmias Y!, { s10 } | |||||
| fmacs s11, s0, s7 | |||||
| fstmias Y!, { s11 } | |||||
| .endm | |||||
| // Real single: one contiguous element. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| fldmias Y , { s8 } | |||||
| fmacs s8 , s0, s4 | |||||
| fstmias Y!, { s8 } | |||||
| .endm | |||||
| // Real single: one strided element. | |||||
| .macro KERNEL_S1 | |||||
| fldmias X , { s4 } | |||||
| fldmias Y , { s8 } | |||||
| fmacs s8 , s0, s4 | |||||
| fstmias Y , { s8 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| // ---- COMPLEX variants: elements are pairs (presumably interleaved re, im - confirm) ---- | |||||
| // Per pair (x0, x1) / (y0, y1): y0 gets d0*x0 via FMAC_R1 and d1*x1 via FMAC_R2; | |||||
| // y1 gets d0*x1 via FMAC_I1 and d1*x0 via FMAC_I2 (s registers in single precision). | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| // Complex double: two identical halves of two pairs each, i.e. four pairs per expansion. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d7 } | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad Y , { d8 - d11 } | |||||
| FMAC_R1 d8 , d0, d4 | |||||
| FMAC_R2 d8 , d1, d5 | |||||
| FMAC_I1 d9 , d0, d5 | |||||
| FMAC_I2 d9 , d1, d4 | |||||
| fstmiad Y!, { d8 } | |||||
| fstmiad Y!, { d9 } | |||||
| FMAC_R1 d10, d0, d6 | |||||
| FMAC_R2 d10, d1, d7 | |||||
| FMAC_I1 d11, d0, d7 | |||||
| FMAC_I2 d11, d1, d6 | |||||
| fstmiad Y!, { d10 } | |||||
| fstmiad Y!, { d11 } | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d4 - d7 } | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad Y , { d8 - d11 } | |||||
| FMAC_R1 d8 , d0, d4 | |||||
| FMAC_R2 d8 , d1, d5 | |||||
| FMAC_I1 d9 , d0, d5 | |||||
| FMAC_I2 d9 , d1, d4 | |||||
| fstmiad Y!, { d8 } | |||||
| fstmiad Y!, { d9 } | |||||
| FMAC_R1 d10, d0, d6 | |||||
| FMAC_R2 d10, d1, d7 | |||||
| FMAC_I1 d11, d0, d7 | |||||
| FMAC_I2 d11, d1, d6 | |||||
| fstmiad Y!, { d10 } | |||||
| fstmiad Y!, { d11 } | |||||
| .endm | |||||
| // Complex double: one contiguous pair. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 - d5 } | |||||
| fldmiad Y , { d8 - d9 } | |||||
| FMAC_R1 d8 , d0, d4 | |||||
| FMAC_R2 d8 , d1, d5 | |||||
| FMAC_I1 d9 , d0, d5 | |||||
| FMAC_I2 d9 , d1, d4 | |||||
| fstmiad Y!, { d8 } | |||||
| fstmiad Y!, { d9 } | |||||
| .endm | |||||
| // Complex double: one strided pair. | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X , { d4 - d5 } | |||||
| fldmiad Y , { d8 - d9 } | |||||
| FMAC_R1 d8 , d0, d4 | |||||
| FMAC_R2 d8 , d1, d5 | |||||
| FMAC_I1 d9 , d0, d5 | |||||
| FMAC_I2 d9 , d1, d4 | |||||
| fstmiad Y , { d8 - d9 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| // Complex single: four pairs per expansion; prefetches are issued only before the first half. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmias X!, { s4 - s7 } | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmias Y , { s8 - s11 } | |||||
| FMAC_R1 s8 , s0, s4 | |||||
| FMAC_R2 s8 , s1, s5 | |||||
| FMAC_I1 s9 , s0, s5 | |||||
| FMAC_I2 s9 , s1, s4 | |||||
| fstmias Y!, { s8 } | |||||
| fstmias Y!, { s9 } | |||||
| FMAC_R1 s10, s0, s6 | |||||
| FMAC_R2 s10, s1, s7 | |||||
| FMAC_I1 s11, s0, s7 | |||||
| FMAC_I2 s11, s1, s6 | |||||
| fstmias Y!, { s10 } | |||||
| fstmias Y!, { s11 } | |||||
| fldmias X!, { s4 - s7 } | |||||
| fldmias Y , { s8 - s11 } | |||||
| FMAC_R1 s8 , s0, s4 | |||||
| FMAC_R2 s8 , s1, s5 | |||||
| FMAC_I1 s9 , s0, s5 | |||||
| FMAC_I2 s9 , s1, s4 | |||||
| fstmias Y!, { s8 } | |||||
| fstmias Y!, { s9 } | |||||
| FMAC_R1 s10, s0, s6 | |||||
| FMAC_R2 s10, s1, s7 | |||||
| FMAC_I1 s11, s0, s7 | |||||
| FMAC_I2 s11, s1, s6 | |||||
| fstmias Y!, { s10 } | |||||
| fstmias Y!, { s11 } | |||||
| .endm | |||||
| // Complex single: one contiguous pair. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 - s5 } | |||||
| fldmias Y , { s8 - s9 } | |||||
| FMAC_R1 s8 , s0, s4 | |||||
| FMAC_R2 s8 , s1, s5 | |||||
| FMAC_I1 s9 , s0, s5 | |||||
| FMAC_I2 s9 , s1, s4 | |||||
| fstmias Y!, { s8 } | |||||
| fstmias Y!, { s9 } | |||||
| .endm | |||||
| // Complex single: one strided pair. | |||||
| .macro KERNEL_S1 | |||||
| fldmias X , { s4 - s5 } | |||||
| fldmias Y , { s8 - s9 } | |||||
| FMAC_R1 s8 , s0, s4 | |||||
| FMAC_R2 s8 , s1, s5 | |||||
| FMAC_I1 s9 , s0, s5 | |||||
| FMAC_I2 s9 , s1, s4 | |||||
| fstmias Y , { s8 - s9 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // AXPY kernel entry point: updates N elements of Y in place via the KERNEL_* macros. | |||||
| // Unit strides on both vectors take the contiguous axpy_kernel_F* path, anything else the | |||||
| // strided axpy_kernel_S* path. N <= 0, INC_X == 0 or INC_Y == 0 leaves Y untouched. | |||||
| // Always returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Save r4 and fp, then point fp just above them so the OLD_* slots can be addressed. | |||||
| push {r4 , fp} | |||||
| add fp, sp, #8 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr Y, OLD_Y | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| // Save the VFP registers listed below at fp - 128, inside the area just reserved. | |||||
| sub r12, fp, #128 | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15} // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15} // store floating point registers | |||||
| #endif | |||||
| // Early exits: nothing to do for N <= 0 or a zero stride. | |||||
| cmp N, #0 | |||||
| ble axpy_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq axpy_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq axpy_kernel_L999 | |||||
| // The contiguous path needs both strides to be exactly 1. | |||||
| cmp INC_X, #1 | |||||
| bne axpy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne axpy_kernel_S_BEGIN | |||||
| // ---- contiguous path ---- | |||||
| axpy_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble axpy_kernel_F1 | |||||
| .align 5 | |||||
| // Main loop, unrolled twice: each KERNEL_F4 handles four elements. | |||||
| axpy_kernel_F4: | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||||
| // The real single-precision KERNEL_F4 has no prefetch of its own. | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble axpy_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_F4 | |||||
| // Tail: the remaining N mod 4 elements, one at a time. | |||||
| axpy_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble axpy_kernel_L999 | |||||
| axpy_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_F10 | |||||
| b axpy_kernel_L999 | |||||
| // ---- strided path: convert both strides from elements to bytes first ---- | |||||
| axpy_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble axpy_kernel_S1 | |||||
| .align 5 | |||||
| // Four single-element steps per iteration. | |||||
| axpy_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_S4 | |||||
| // Tail: the remaining N mod 4 elements. | |||||
| axpy_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble axpy_kernel_L999 | |||||
| axpy_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_S10 | |||||
| // Common exit. r3 (X) is no longer needed and is reused as the restore pointer. | |||||
| axpy_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| // Drop the reserved area and restore r4 / fp. | |||||
| sub sp, fp, #8 | |||||
| pop {r4,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,222 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| // Configuration and register aliases for the single-precision complex COPY kernel. | |||||
| // ASSEMBLER is defined before common.h is included. | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of stack reserved by the prologue. | |||||
| #define STACKSIZE 256 | |||||
| // Inputs used as found in r0-r3 on entry; OLD_Y is copied into Y by the prologue. | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define OLD_Y r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| // Stack slot holding the Y stride. The prologue pushes seven registers and sets | |||||
| // fp = sp + 24, so [fp, #4] is the first word above the pushed registers. | |||||
| #define OLD_INC_Y [fp, #4 ] | |||||
| // Loop counter and working copies, kept in registers the prologue saves (r4-r9). | |||||
| #define I r5 | |||||
| #define Y r6 | |||||
| #define INC_Y r7 | |||||
| // Byte offset ahead of X used by the pld prefetch hint. | |||||
| #define X_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // COPY_* macros: move pairs of single-precision values from X to Y. | |||||
| // COPY_F4 / COPY_F1 handle contiguous data and advance both pointers through writeback. | |||||
| // COPY_S4 / COPY_S1 handle strided data and add INC_X / INC_Y to the pointers unscaled, | |||||
| // so both must already be byte strides when they are expanded. | |||||
| // Contiguous: eight floats (four pairs) per expansion, with a prefetch on X. | |||||
| .macro COPY_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmias X!, { s0 - s7 } | |||||
| fstmias Y!, { s0 - s7 } | |||||
| .endm | |||||
| // Contiguous: one pair. | |||||
| .macro COPY_F1 | |||||
| fldmias X!, { s0 - s1 } | |||||
| fstmias Y!, { s0 - s1 } | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| // Strided: four pairs, alternating between s0-s1 and s2-s3 as the staging registers. | |||||
| .macro COPY_S4 | |||||
| nop | |||||
| fldmias X, { s0 - s1 } | |||||
| fstmias Y, { s0 - s1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s2 - s3 } | |||||
| fstmias Y, { s2 - s3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s0 - s1 } | |||||
| fstmias Y, { s0 - s1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s2 - s3 } | |||||
| fstmias Y, { s2 - s3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| // Strided: one pair. | |||||
| .macro COPY_S1 | |||||
| fldmias X, { s0 - s1 } | |||||
| fstmias Y, { s0 - s1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Complex single-precision COPY kernel entry point: copies N pairs from X to Y. | |||||
| // Unit strides on both vectors take the contiguous ccopy_kernel_F* path, anything else the | |||||
| // strided ccopy_kernel_S* path. N <= 0, INC_X == 0 or INC_Y == 0 copies nothing. | |||||
| // Always returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Save r4-r9 and fp (seven words); fp then points at the highest pushed word. | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| // Save s8-s15 at fp - 128, inside the area just reserved. | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| // Early exits: nothing to do for N <= 0 or a zero stride. | |||||
| cmp N, #0 | |||||
| ble ccopy_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq ccopy_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq ccopy_kernel_L999 | |||||
| // The contiguous path needs both strides to be exactly 1. | |||||
| cmp INC_X, #1 | |||||
| bne ccopy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne ccopy_kernel_S_BEGIN | |||||
| // ---- contiguous path ---- | |||||
| ccopy_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble ccopy_kernel_F1 | |||||
| ccopy_kernel_F4: | |||||
| COPY_F4 | |||||
| subs I, I, #1 | |||||
| bne ccopy_kernel_F4 | |||||
| // Tail: the remaining N mod 4 pairs, one at a time. | |||||
| ccopy_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble ccopy_kernel_L999 | |||||
| ccopy_kernel_F10: | |||||
| COPY_F1 | |||||
| subs I, I, #1 | |||||
| bne ccopy_kernel_F10 | |||||
| b ccopy_kernel_L999 | |||||
| // ---- strided path: convert both strides from elements to bytes first ---- | |||||
| ccopy_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble ccopy_kernel_S1 | |||||
| ccopy_kernel_S4: | |||||
| COPY_S4 | |||||
| subs I, I, #1 | |||||
| bne ccopy_kernel_S4 | |||||
| // Tail: the remaining N mod 4 pairs. | |||||
| ccopy_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble ccopy_kernel_L999 | |||||
| ccopy_kernel_S10: | |||||
| COPY_S1 | |||||
| subs I, I, #1 | |||||
| bne ccopy_kernel_S10 | |||||
| // Common exit. r3 (OLD_Y) was copied into Y earlier and is reused as the restore pointer. | |||||
| ccopy_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| // Drop the reserved area and restore r4-r9 / fp. | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,284 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/11 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp for local storage */ | |||||
| #define N r0 /* element count; the routine leaves early when N <= 0 */ | |||||
| #define X r1 /* current read position in the first vector */ | |||||
| #define INC_X r2 /* increment of X; shifted left by 3 (8 bytes per element) on the strided path */ | |||||
| #define OLD_Y r3 /* incoming pointer to the second vector, copied into Y */ | |||||
| /****************************************************** | |||||
| * The prologue stores s8 - s15 (eight 4-byte registers, | |||||
| * 32 bytes) starting at [fp, #-128] and the epilogue | |||||
| * reloads them from the same address. The range noted | |||||
| * originally, [fp, #-128] - [fp, #-64], is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define OLD_INC_Y [fp, #4 ] /* stack word holding the increment of Y; loaded into INC_Y */ | |||||
| #define I r5 /* loop counter */ | |||||
| #define Y r6 /* current read position in the second vector */ | |||||
| #define INC_Y r7 /* increment of Y; shifted left by 3 on the strided path */ | |||||
| #define X_PRE 512 /* byte offset of the pld hints in KERNEL_F4 (used for X and for Y) */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro KERNEL_F4 /* unit-stride step: reads four 8-byte elements from X and from Y, post-incrementing both pointers */ | |||||
| pld [ X, #X_PRE ] /* prefetch hint X_PRE bytes ahead of X */ | |||||
| pld [ Y, #X_PRE ] /* same distance ahead of Y */ | |||||
| fldmias X!, { s4 - s5 } /* element 1: s4,s5 = the two words of x (presumably real, imaginary) */ | |||||
| fldmias Y!, { s8 - s9 } /* s8,s9 = the two words of y */ | |||||
| fmacs s0 , s4, s8 /* s0 += x.w0 * y.w0 */ | |||||
| fmacs s1 , s4, s9 /* s1 += x.w0 * y.w1 */ | |||||
| fldmias X!, { s6 - s7 } /* element 2 of x is fetched in between the multiply-accumulates of element 1 */ | |||||
| fmacs s2 , s5, s9 /* s2 += x.w1 * y.w1 */ | |||||
| fmacs s3 , s5, s8 /* s3 += x.w1 * y.w0 */ | |||||
| fldmias Y!, { s10 - s11 } /* element 2 of y */ | |||||
| fmacs s0 , s6, s10 /* same four products for element 2, using s6,s7 and s10,s11 */ | |||||
| fmacs s1 , s6, s11 | |||||
| fmacs s2 , s7, s11 | |||||
| fmacs s3 , s7, s10 | |||||
| fldmias X!, { s4 - s5 } /* element 3 reuses s4,s5 / s8,s9 */ | |||||
| fldmias Y!, { s8 - s9 } | |||||
| fmacs s0 , s4, s8 | |||||
| fmacs s1 , s4, s9 | |||||
| fldmias X!, { s6 - s7 } /* element 4 reuses s6,s7 / s10,s11 */ | |||||
| fmacs s2 , s5, s9 | |||||
| fmacs s3 , s5, s8 | |||||
| fldmias Y!, { s10 - s11 } | |||||
| fmacs s0 , s6, s10 | |||||
| fmacs s1 , s6, s11 | |||||
| fmacs s2 , s7, s11 | |||||
| fmacs s3 , s7, s10 | |||||
| .endm | |||||
| .macro KERNEL_F1 /* unit-stride step for a single 8-byte element; used for the N & 3 remainder */ | |||||
| fldmias X!, { s4 - s5 } /* s4,s5 = x element; X advances by 8 bytes */ | |||||
| fldmias Y!, { s8 - s9 } /* s8,s9 = y element; Y advances by 8 bytes */ | |||||
| fmacs s0 , s4, s8 /* s0 += x.w0 * y.w0 */ | |||||
| fmacs s1 , s4, s9 /* s1 += x.w0 * y.w1 */ | |||||
| fmacs s2 , s5, s9 /* s2 += x.w1 * y.w1 */ | |||||
| fmacs s3 , s5, s8 /* s3 += x.w1 * y.w0 */ | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro KERNEL_S4 /* strided step: four elements, pointers advanced by INC_X / INC_Y (already in bytes) */ | |||||
| nop /* NOTE(review): purpose of this nop is not evident from the file - presumably alignment or scheduling */ | |||||
| fldmias X, { s4 - s5 } /* element 1: no writeback, the pointer is moved by the add below */ | |||||
| fldmias Y, { s8 - s9 } | |||||
| fmacs s0 , s4, s8 /* s0 += x.w0 * y.w0 */ | |||||
| fmacs s1 , s4, s9 /* s1 += x.w0 * y.w1 */ | |||||
| fmacs s2 , s5, s9 /* s2 += x.w1 * y.w1 */ | |||||
| fmacs s3 , s5, s8 /* s3 += x.w1 * y.w0 */ | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s4 - s5 } /* element 2 */ | |||||
| fldmias Y, { s8 - s9 } | |||||
| fmacs s0 , s4, s8 | |||||
| fmacs s1 , s4, s9 | |||||
| fmacs s2 , s5, s9 | |||||
| fmacs s3 , s5, s8 | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s4 - s5 } /* element 3 */ | |||||
| fldmias Y, { s8 - s9 } | |||||
| fmacs s0 , s4, s8 | |||||
| fmacs s1 , s4, s9 | |||||
| fmacs s2 , s5, s9 | |||||
| fmacs s3 , s5, s8 | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s4 - s5 } /* element 4 */ | |||||
| fldmias Y, { s8 - s9 } | |||||
| fmacs s0 , s4, s8 | |||||
| fmacs s1 , s4, s9 | |||||
| fmacs s2 , s5, s9 | |||||
| fmacs s3 , s5, s8 | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| .macro KERNEL_S1 /* strided step for a single element; used for the N & 3 remainder */ | |||||
| fldmias X, { s4 - s5 } /* s4,s5 = x element (pointer not written back) */ | |||||
| fldmias Y, { s8 - s9 } /* s8,s9 = y element */ | |||||
| fmacs s0 , s4, s8 /* s0 += x.w0 * y.w0 */ | |||||
| fmacs s1 , s4, s9 /* s1 += x.w0 * y.w1 */ | |||||
| fmacs s2 , s5, s9 /* s2 += x.w1 * y.w1 */ | |||||
| fmacs s3 , s5, s8 /* s3 += x.w1 * y.w0 */ | |||||
| add X, X, INC_X /* INC_X / INC_Y were scaled to bytes before the loop */ | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * cdot kernel entry point (single precision, 8 bytes per element). | |||||
| * | |||||
| * Inputs as named by the defines of this file: N (r0), X (r1), INC_X (r2), | |||||
| * OLD_Y (r3) and OLD_INC_Y (the stack word at [fp, #4]). | |||||
| * Four partial sums are kept in s0 - s3 and folded into s0, s1 on exit: | |||||
| * CONJ not defined: s0 = s0 - s2, s1 = s1 + s3 | |||||
| * CONJ defined: s0 = s0 + s2, s1 = s1 - s3 | |||||
| * N <= 0, INC_X == 0 or INC_Y == 0 skip every load, so s0 and s1 come back as zero. | |||||
| * INC_X == 1 together with INC_Y == 1 selects KERNEL_F4 / KERNEL_F1; any other pair | |||||
| * of increments is scaled to bytes and handled by KERNEL_S4 / KERNEL_S1. | |||||
| * | |||||
| * The accumulators are cleared by moving an integer zero into them. Clearing with | |||||
| * "vsub.f32 sN, sN, sN" is only correct for finite contents: when the register holds | |||||
| * NaN or +/-Inf on entry the difference is NaN and the whole result becomes NaN. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 // fp -> saved fp slot; [fp, #4] is the first stack word of the caller | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| mov r4, #0 // r4 is free again after the vstm above | |||||
| vmov s0 , r4 // s0 = +0.0 (all-zero bit pattern), independent of old contents | |||||
| vmov.f32 s1 , s0 | |||||
| vmov.f32 s2 , s0 | |||||
| vmov.f32 s3 , s0 | |||||
| cmp N, #0 | |||||
| ble cdot_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq cdot_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq cdot_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne cdot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne cdot_kernel_S_BEGIN | |||||
| cdot_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble cdot_kernel_F1 | |||||
| cdot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne cdot_kernel_F4 | |||||
| cdot_kernel_F1: | |||||
| ands I, N, #3 // remaining 0..3 elements | |||||
| ble cdot_kernel_L999 | |||||
| cdot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne cdot_kernel_F10 | |||||
| b cdot_kernel_L999 | |||||
| cdot_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble cdot_kernel_S1 | |||||
| cdot_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne cdot_kernel_S4 | |||||
| cdot_kernel_S1: | |||||
| ands I, N, #3 // remaining 0..3 elements | |||||
| ble cdot_kernel_L999 | |||||
| cdot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne cdot_kernel_S10 | |||||
| cdot_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| #if !defined(CONJ) | |||||
| vsub.f32 s0 , s0, s2 // combine the partial sums into the s0, s1 result pair | |||||
| vadd.f32 s1 , s1, s3 | |||||
| #else | |||||
| vadd.f32 s0 , s0, s2 | |||||
| vsub.f32 s1 , s1, s3 | |||||
| #endif | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,258 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/05 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp for local storage */ | |||||
| #define OLD_M r0 /* OLD_* name the incoming argument registers; the code below uses M, N, A and r3 directly */ | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * The prologue stores s8 - s15 (32 bytes) starting at | |||||
| * [fp, #-128] and the epilogue reloads them from the | |||||
| * same address. The range noted originally, | |||||
| * [fp, #-128] - [fp, #-64], is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define LDA [fp, #-260 ] /* stack slot holding r3 << 3, i.e. the stride between source runs in bytes */ | |||||
| #define B [fp, #4 ] /* stack word from which the destination pointer BO is loaded */ | |||||
| #define M r0 /* count along each source run (halved for the 2-element loops) */ | |||||
| #define N r1 /* number of LDA-strided source runs */ | |||||
| #define A r2 /* start of the next source run(s); advanced by LDA or 2 * LDA */ | |||||
| #define BO r5 /* destination write pointer, post-incremented by the COPY macros */ | |||||
| #define AO1 r6 /* read pointer into the first source run */ | |||||
| #define AO2 r7 /* read pointer into the second source run (AO1 + LDA) */ | |||||
| #define I r3 /* inner loop counter; r3 is free once LDA has been stored */ | |||||
| #define J r12 /* outer loop counter */ | |||||
| #define A_PRE 256 /* byte offset of the pld hints in the 2x2 loop */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY2x2 /* two 8-byte elements from AO1 and two from AO2, stored interleaved element by element */ | |||||
| flds s0 , [ AO1, #0 ] /* s0,s1 = AO1 element 0 */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s4 , [ AO1, #8 ] /* s4,s5 = AO1 element 1 */ | |||||
| flds s5 , [ AO1, #12 ] | |||||
| flds s2 , [ AO2, #0 ] /* s2,s3 = AO2 element 0 */ | |||||
| flds s3 , [ AO2, #4 ] | |||||
| add AO1, AO1, #16 | |||||
| flds s6 , [ AO2, #8 ] /* s6,s7 = AO2 element 1 */ | |||||
| flds s7 , [ AO2, #12 ] | |||||
| fstmias BO!, { s0 - s7 } /* output order: AO1[0], AO2[0], AO1[1], AO2[1]; BO advances by 32 bytes */ | |||||
| add AO2, AO2, #16 | |||||
| .endm | |||||
| .macro COPY1x2 /* one 8-byte element from AO1 followed by one from AO2 */ | |||||
| flds s0 , [ AO1, #0 ] /* s0,s1 = AO1 element */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s2 , [ AO2, #0 ] /* s2,s3 = AO2 element */ | |||||
| flds s3 , [ AO2, #4 ] | |||||
| add AO1, AO1, #8 | |||||
| fstmias BO!, { s0 - s3 } /* BO advances by 16 bytes */ | |||||
| add AO2, AO2, #8 | |||||
| .endm | |||||
| .macro COPY2x1 /* straight copy of two consecutive 8-byte elements from AO1 to BO */ | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s2 , [ AO1, #8 ] | |||||
| flds s3 , [ AO1, #12 ] | |||||
| fstmias BO!, { s0 - s3 } /* BO advances by 16 bytes */ | |||||
| add AO1, AO1, #16 | |||||
| .endm | |||||
| .macro COPY1x1 /* straight copy of one 8-byte element from AO1 to BO */ | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO1, #4 ] | |||||
| fstmias BO!, { s0 - s1 } /* BO advances by 8 bytes */ | |||||
| add AO1, AO1, #8 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * cgemm_ncopy entry point. | |||||
| * | |||||
| * Inputs: M (r0), N (r1), A (r2), lda in r3 (scaled here to bytes, 8 per element) and | |||||
| * the destination pointer in the stack word B. | |||||
| * Source runs that are LDA bytes apart are consumed two at a time; their elements are | |||||
| * written to BO interleaved (COPY2x2 / COPY1x2). If N is odd the last run is copied | |||||
| * unchanged (COPY2x1 / COPY1x1). r0 is 0 on return. | |||||
| * | |||||
| * Loop-entry tests use "asr + cmp + ble" or "beq". asrs, ands and tst do not write the | |||||
| * V flag, so a "ble" placed directly after them would read a V left over from earlier | |||||
| * code - for the first test, whatever the caller left behind. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| lsl r3, r3, #3 // lda = lda * 4 * 2 | |||||
| str r3, LDA | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| ldr BO, B | |||||
| /*********************************************************************************************/ | |||||
| cgemm_ncopy_L2_BEGIN: | |||||
| asr J, N, #1 // J = N / 2 | |||||
| cmp J, #0 // cmp defines N, Z and V for the ble | |||||
| ble cgemm_ncopy_L1_BEGIN | |||||
| cgemm_ncopy_L2_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add AO2, AO1, r4 | |||||
| add A , AO2, r4 // A = A + 2 * LDA | |||||
| asr I, M, #1 // I = M / 2 | |||||
| cmp I, #0 | |||||
| ble cgemm_ncopy_L2_M2_40 | |||||
| cgemm_ncopy_L2_M2_20: | |||||
| pld [ AO1, #A_PRE ] | |||||
| pld [ AO2, #A_PRE ] | |||||
| COPY2x2 | |||||
| subs I , I , #1 | |||||
| ble cgemm_ncopy_L2_M2_40 | |||||
| COPY2x2 // second copy per pass: one pair of pld hints covers two COPY2x2 | |||||
| subs I , I , #1 | |||||
| bne cgemm_ncopy_L2_M2_20 | |||||
| cgemm_ncopy_L2_M2_40: | |||||
| ands I, M , #1 // I = 1 when M is odd | |||||
| beq cgemm_ncopy_L2_M2_END | |||||
| cgemm_ncopy_L2_M2_60: | |||||
| COPY1x2 | |||||
| subs I , I , #1 | |||||
| bne cgemm_ncopy_L2_M2_60 | |||||
| cgemm_ncopy_L2_M2_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne cgemm_ncopy_L2_M2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| cgemm_ncopy_L1_BEGIN: | |||||
| tst N, #1 // nothing left unless N is odd | |||||
| beq cgemm_ncopy_L999 | |||||
| cgemm_ncopy_L1_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add A , AO1, r4 // A = A + 1 * LDA | |||||
| asr I, M, #1 // I = M / 2 | |||||
| cmp I, #0 | |||||
| ble cgemm_ncopy_L1_M2_40 | |||||
| cgemm_ncopy_L1_M2_20: | |||||
| COPY2x1 | |||||
| subs I , I , #1 | |||||
| bne cgemm_ncopy_L1_M2_20 | |||||
| cgemm_ncopy_L1_M2_40: | |||||
| ands I, M , #1 // I = 1 when M is odd | |||||
| beq cgemm_ncopy_L1_M2_END | |||||
| cgemm_ncopy_L1_M2_60: | |||||
| COPY1x1 | |||||
| subs I , I , #1 | |||||
| bne cgemm_ncopy_L1_M2_60 | |||||
| cgemm_ncopy_L1_M2_END: | |||||
| cgemm_ncopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,243 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp for local storage */ | |||||
| #define OLD_M r0 /* OLD_* name the incoming argument registers */ | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 /* saved to the stack slot A before r2 is reused as M4 */ | |||||
| #define OLD_LDA r3 /* scaled by 8 into LDA before r3 is reused as scratch */ | |||||
| /****************************************************** | |||||
| * The prologue stores s8 - s15 (32 bytes) starting at | |||||
| * [fp, #-128] and the epilogue reloads them from the | |||||
| * same address. The range noted originally, | |||||
| * [fp, #-128] - [fp, #-64], is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] /* stack word with the next BO1 start; bumped by 32 or 16 bytes per outer pass */ | |||||
| #define A [fp, #-248 ] /* stack slot with the next source position; bumped by 2 * LDA or LDA per outer pass */ | |||||
| #define M r0 /* outer loop extent */ | |||||
| #define N r1 /* inner loop extent */ | |||||
| #define M4 r2 /* M << 4: byte step applied to BO1 after each 2-element copy */ | |||||
| #define LDA r5 /* source stride in bytes (lda << 3) */ | |||||
| #define AO1 r6 /* source read pointer */ | |||||
| #define BO1 r7 /* destination pointer for the 2-element copies */ | |||||
| #define BO2 r8 /* destination pointer for the odd-N tail, starts at B + (N & -2) * M * 8 */ | |||||
| #define I r4 /* inner loop counter */ | |||||
| #define J r12 /* outer loop counter */ | |||||
| #define A_PRE 256 /* not referenced by the macros or the routine of this file */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY2x2 /* two 8-byte elements from AO1 and two from AO1 + LDA, stored back to back at BO1 */ | |||||
| fldmias AO1, { s0 - s3 } /* 16 bytes at AO1 */ | |||||
| add r3, AO1, LDA /* r3 is clobbered as the address of the second source run */ | |||||
| fldmias r3, { s4 - s7 } /* 16 bytes at AO1 + LDA */ | |||||
| fstmias BO1, { s0 - s7 } /* 32 bytes written, no writeback */ | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 /* next destination block is M4 bytes further on */ | |||||
| .endm | |||||
| .macro COPY1x2 /* one 8-byte element from AO1 and one from AO1 + LDA, stored at BO2 */ | |||||
| fldmias AO1, { s0 -s1 } | |||||
| add r3, AO1, LDA /* r3 is clobbered as the address of the second source run */ | |||||
| fldmias r3, { s2 - s3 } | |||||
| fstmias BO2, { s0 - s3 } /* 16 bytes written, no writeback */ | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #16 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY2x1 /* two consecutive 8-byte elements from AO1 copied to BO1 */ | |||||
| fldmias AO1, { s0 - s3 } | |||||
| fstmias BO1, { s0 - s3 } /* 16 bytes written, no writeback */ | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 /* next destination block is M4 bytes further on */ | |||||
| .endm | |||||
| .macro COPY1x1 /* one 8-byte element from AO1 copied to BO2 */ | |||||
| fldmias AO1, { s0 - s1 } | |||||
| fstmias BO2, { s0 - s1 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #8 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * cgemm_tcopy entry point. | |||||
| * | |||||
| * Inputs: M (r0), N (r1), source pointer in r2, lda in r3 (scaled to bytes, 8 per | |||||
| * element) and the destination pointer in the stack word B. | |||||
| * The outer loop runs M / 2 times over pairs of source runs that are LDA bytes apart, | |||||
| * the inner loop N / 2 times over pairs of elements. Pair copies go to BO1, which | |||||
| * steps by M4 = M * 16 bytes; the odd-N tail goes to BO2 = B + (N & -2) * M * 8. | |||||
| * An odd M is finished by the L1 section with single-run copies. r0 is 0 on return. | |||||
| * | |||||
| * Loop-entry tests use "asr + cmp + ble" or "beq". asrs and tst do not write the V | |||||
| * flag, so a "ble" placed directly after them would read a V left over from earlier | |||||
| * code - for the first test, whatever the caller left behind. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2 | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| lsl r4 , M, #3 // M * SIZE * 2 | |||||
| ldr r3, B | |||||
| and BO2 , N , #-2 // N rounded down to an even count | |||||
| mul BO2, BO2, r4 | |||||
| add BO2 , BO2, r3 // BO2 = B + (N & -2) * M * SIZE * 2 | |||||
| lsl M4, M, #4 // M4 = M * 2 * SIZE * 2 | |||||
| cgemm_tcopy_L2_BEGIN: | |||||
| asr J, M, #1 // J = M / 2 | |||||
| cmp J, #0 // cmp defines N, Z and V for the ble | |||||
| ble cgemm_tcopy_L1_BEGIN | |||||
| cgemm_tcopy_L2_M2_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 4 * SIZE *2 | |||||
| str r3, B | |||||
| asr I, N, #1 // I = N / 2 | |||||
| cmp I, #0 | |||||
| ble cgemm_tcopy_L2_M2_60 | |||||
| cgemm_tcopy_L2_M2_40: | |||||
| COPY2x2 | |||||
| subs I, I, #1 | |||||
| bne cgemm_tcopy_L2_M2_40 | |||||
| cgemm_tcopy_L2_M2_60: | |||||
| tst N , #1 // odd N: one trailing element per run | |||||
| beq cgemm_tcopy_L2_M2_END | |||||
| COPY1x2 | |||||
| cgemm_tcopy_L2_M2_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne cgemm_tcopy_L2_M2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| cgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 // nothing left unless M is odd | |||||
| beq cgemm_tcopy_L999 | |||||
| cgemm_tcopy_L1_M2_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #16 // B = B + 2 * SIZE *2 | |||||
| str r3, B | |||||
| asr I, N, #1 // I = N / 2 | |||||
| cmp I, #0 | |||||
| ble cgemm_tcopy_L1_M2_60 | |||||
| cgemm_tcopy_L1_M2_40: | |||||
| COPY2x1 | |||||
| subs I, I, #1 | |||||
| bne cgemm_tcopy_L1_M2_40 | |||||
| cgemm_tcopy_L1_M2_60: | |||||
| tst N , #1 // odd N: one trailing element | |||||
| beq cgemm_tcopy_L1_M2_END | |||||
| COPY1x1 | |||||
| cgemm_tcopy_L1_M2_END: | |||||
| cgemm_tcopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,697 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/29 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp for local storage */ | |||||
| #define OLD_LDA [fp, #0 ] /* fp is set to the entry sp (sp + 28 after pushing 7 registers), so these are the caller's stack words */ | |||||
| #define X [fp, #4 ] /* start of x; reloaded into XO for every block of rows */ | |||||
| #define OLD_INC_X [fp, #8 ] /* loaded into INC_X */ | |||||
| #define Y [fp, #12 ] /* start of y; loaded into YO */ | |||||
| #define OLD_INC_Y [fp, #16 ] /* loaded into INC_Y */ | |||||
| #define OLD_A r3 /* incoming matrix pointer, saved to the stack slot A */ | |||||
| #define OLD_M r0 /* incoming M, saved to the stack slot M because r0 is reused as AO1 */ | |||||
| #define AO1 r0 /* read pointer into A */ | |||||
| #define N r1 /* number of LDA-strided steps per block of rows */ | |||||
| #define J r2 /* inner loop counter */ | |||||
| #define AO2 r4 /* second pointer into A; only dereferenced by a pld hint in KERNEL_F4X1 */ | |||||
| #define XO r5 /* read pointer into x */ | |||||
| #define YO r6 /* read/write pointer into y */ | |||||
| #define LDA r7 /* stride of A in bytes after scaling */ | |||||
| #define INC_X r8 /* increment of x; scaled to bytes on the strided path */ | |||||
| #define INC_Y r9 /* increment of y; scaled to bytes on the strided path */ | |||||
| #define I r12 /* outer loop counter */ | |||||
| #define ALPHA_I [fp, #-236] /* stack copy of s1 as passed in */ | |||||
| #define ALPHA_R [fp, #-244] /* stack copy of s0 as passed in */ | |||||
| #define M [fp, #-252 ] /* stack copy of OLD_M */ | |||||
| #define A [fp, #-256 ] /* stack slot with the start of the next block of rows in A */ | |||||
| #define X_PRE 64 /* pld offset applied to XO in KERNEL_F4X4 */ | |||||
| #define Y_PRE 0 /* pld offset applied to YO in INIT_F4 */ | |||||
| #define A_PRE 0 /* pld offset applied to AO2 in KERNEL_F4X1 */ | |||||
| /************************************************************************************** | |||||
| * Mnemonic selection by CONJ / XCONJ. | |||||
| * fmacs d, a, b computes d = d + a * b; fnmacs d, a, b computes d = d - a * b. | |||||
| * KMAC_R / KMAC_I are used by the KERNEL_* macros for the products that involve the | |||||
| * second word of the A element. FMAC_R1, FMAC_R2, FMAC_I1 and FMAC_I2 are used by | |||||
| * the SAVE_* macros when the accumulators are scaled by ALPHA_R / ALPHA_I and | |||||
| * added to y. | |||||
| **************************************************************************************/ | |||||
| #if !defined(CONJ) && !defined(XCONJ) | |||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_I fmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_I1 fmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #elif defined(CONJ) && !defined(XCONJ) | |||||
| #define KMAC_R fmacs | |||||
| #define KMAC_I fnmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_I1 fmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #elif !defined(CONJ) && defined(XCONJ) | |||||
| #define KMAC_R fmacs | |||||
| #define KMAC_I fnmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fmacs | |||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #else | |||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_I fmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fmacs | |||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #endif | |||||
| /* Clear the eight accumulators s8 - s15 and issue the pld hint for y. | |||||
| * The zero is moved in from an integer register: "vsub.f32 s8, s8, s8" only gives | |||||
| * zero for finite contents and turns a NaN or +/-Inf left in s8 (by the caller or by | |||||
| * the previous block of rows) into NaN. r3 is saved and restored, so the macro | |||||
| * changes no core register and no condition flag. */ | |||||
| .macro INIT_F4 | |||||
| pld [ YO, #Y_PRE ] | |||||
| push {r3} // borrow r3 to carry an integer zero | |||||
| mov r3, #0 // plain mov: condition flags stay untouched | |||||
| vmov s8 , r3 // s8 = +0.0 regardless of its previous contents | |||||
| pop {r3} | |||||
| vmov.f32 s9 , s8 | |||||
| vmov.f32 s10, s8 | |||||
| vmov.f32 s11, s8 | |||||
| vmov.f32 s12, s8 | |||||
| vmov.f32 s13, s8 | |||||
| vmov.f32 s14, s8 | |||||
| vmov.f32 s15, s8 | |||||
| .endm | |||||
| .macro KERNEL_F4X4 /* four KERNEL_F4X1 steps behind a single pld hint on x */ | |||||
| pld [ XO, #X_PRE ] | |||||
| KERNEL_F4X1 | |||||
| KERNEL_F4X1 | |||||
| KERNEL_F4X1 | |||||
| KERNEL_F4X1 | |||||
| .endm | |||||
| .macro KERNEL_F4X1 /* multiplies four consecutive 8-byte elements of A by one element of x (unit x stride) */ | |||||
| pld [ AO2, #A_PRE ] /* AO2 is only used as a prefetch address */ | |||||
| flds s0 , [ AO1 ] /* s0,s1 = A element 0 */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s2 , [ AO1, #8 ] /* s2,s3 = A element 1 */ | |||||
| flds s3 , [ AO1, #12 ] | |||||
| flds s4 , [ XO ] /* s4,s5 = current x element */ | |||||
| flds s5 , [ XO, #4 ] | |||||
| fmacs s8 , s0, s4 /* s8,s9 accumulate element 0 */ | |||||
| fmacs s9 , s0, s5 | |||||
| fmacs s10 , s2, s4 /* s10,s11 accumulate element 1 */ | |||||
| fmacs s11 , s2, s5 | |||||
| KMAC_R s8 , s1, s5 /* products with the second word of A: add or subtract as selected by KMAC_R / KMAC_I */ | |||||
| KMAC_I s9 , s1, s4 | |||||
| KMAC_R s10 , s3, s5 | |||||
| KMAC_I s11 , s3, s4 | |||||
| flds s0 , [ AO1, #16 ] /* elements 2 and 3 reuse s0 - s3 */ | |||||
| flds s1 , [ AO1, #20 ] | |||||
| flds s2 , [ AO1, #24 ] | |||||
| flds s3 , [ AO1, #28 ] | |||||
| fmacs s12 , s0, s4 /* s12,s13 accumulate element 2 */ | |||||
| fmacs s13 , s0, s5 | |||||
| fmacs s14 , s2, s4 /* s14,s15 accumulate element 3 */ | |||||
| fmacs s15 , s2, s5 | |||||
| KMAC_R s12 , s1, s5 | |||||
| KMAC_I s13 , s1, s4 | |||||
| KMAC_R s14 , s3, s5 | |||||
| KMAC_I s15 , s3, s4 | |||||
| add XO , XO, #8 /* next x element */ | |||||
| add AO1 , AO1, LDA /* step both A pointers by the byte stride */ | |||||
| add AO2 , AO2, LDA | |||||
| .endm | |||||
| .macro SAVE_F4 /* adds the four scaled accumulator pairs s8 - s15 to four consecutive y elements */ | |||||
| flds s0, ALPHA_R /* s0,s1 = the two alpha words saved on the stack */ | |||||
| flds s1, ALPHA_I | |||||
| fldmias YO, { s4 - s7 } /* y elements 0 and 1 */ | |||||
| FMAC_R1 s4 , s0 , s8 /* first word: ALPHA_R * acc.w0, then ALPHA_I * acc.w1 with the selected sign */ | |||||
| FMAC_I1 s5 , s0 , s9 /* second word: ALPHA_R * acc.w1, then ALPHA_I * acc.w0 */ | |||||
| FMAC_R2 s4 , s1 , s9 | |||||
| FMAC_I2 s5 , s1 , s8 | |||||
| FMAC_R1 s6 , s0 , s10 | |||||
| FMAC_I1 s7 , s0 , s11 | |||||
| FMAC_R2 s6 , s1 , s11 | |||||
| FMAC_I2 s7 , s1 , s10 | |||||
| fstmias YO!, { s4 - s7 } /* write back and advance YO by 16 bytes */ | |||||
| fldmias YO, { s4 - s7 } /* y elements 2 and 3 */ | |||||
| FMAC_R1 s4 , s0 , s12 | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| FMAC_R1 s6 , s0 , s14 | |||||
| FMAC_I1 s7 , s0 , s15 | |||||
| FMAC_R2 s6 , s1 , s15 | |||||
| FMAC_I2 s7 , s1 , s14 | |||||
| fstmias YO!, { s4 - s7 } /* YO has now advanced by 32 bytes in total */ | |||||
| .endm | |||||
| /* Clear the accumulator pair s8, s9. | |||||
| * The zero is moved in from an integer register: "vsub.f32 s8, s8, s8" only gives | |||||
| * zero for finite contents and turns a NaN or +/-Inf left in s8 into NaN. | |||||
| * r3 is saved and restored; no core register or condition flag changes. */ | |||||
| .macro INIT_F1 | |||||
| push {r3} // borrow r3 to carry an integer zero | |||||
| mov r3, #0 // plain mov: condition flags stay untouched | |||||
| vmov s8 , r3 // s8 = +0.0 regardless of its previous contents | |||||
| pop {r3} | |||||
| vmov.f32 s9 , s8 | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* multiplies one 8-byte element of A by one element of x (unit x stride) */ | |||||
| flds s0 , [ AO1 ] /* s0,s1 = A element */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s4 , [ XO ] /* s4,s5 = x element */ | |||||
| flds s5 , [ XO, #4 ] | |||||
| fmacs s8 , s0, s4 /* s8,s9 accumulate the result */ | |||||
| fmacs s9 , s0, s5 | |||||
| KMAC_R s8 , s1, s5 /* sign chosen by the KMAC_R / KMAC_I selection */ | |||||
| KMAC_I s9 , s1, s4 | |||||
| add XO , XO, #8 | |||||
| add AO1 , AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_F1 /* adds the scaled accumulator pair s8, s9 to one y element and advances YO by 8 bytes */ | |||||
| flds s0, ALPHA_R /* s0,s1 = the two alpha words saved on the stack */ | |||||
| flds s1, ALPHA_I | |||||
| fldmias YO, { s4 - s5 } | |||||
| FMAC_R1 s4 , s0 , s8 /* first word: ALPHA_R * acc.w0, then ALPHA_I * acc.w1 with the selected sign */ | |||||
| FMAC_I1 s5 , s0 , s9 /* second word: ALPHA_R * acc.w1, then ALPHA_I * acc.w0 */ | |||||
| FMAC_R2 s4 , s1 , s9 | |||||
| FMAC_I2 s5 , s1 , s8 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, #8 | |||||
| .endm | |||||
| /****************************************************************************************/ | |||||
| /* Clear the eight accumulators s8 - s15 for the strided path. | |||||
| * The zero is moved in from an integer register: "vsub.f32 s8, s8, s8" only gives | |||||
| * zero for finite contents and turns a NaN or +/-Inf left in s8 (by the caller or by | |||||
| * the previous block of rows) into NaN. r3 is saved and restored, so the macro | |||||
| * changes no core register and no condition flag. */ | |||||
| .macro INIT_S4 | |||||
| push {r3} // borrow r3 to carry an integer zero | |||||
| mov r3, #0 // plain mov: condition flags stay untouched | |||||
| vmov s8 , r3 // s8 = +0.0 regardless of its previous contents | |||||
| pop {r3} | |||||
| vmov.f32 s9 , s8 | |||||
| vmov.f32 s10, s8 | |||||
| vmov.f32 s11, s8 | |||||
| vmov.f32 s12, s8 | |||||
| vmov.f32 s13, s8 | |||||
| vmov.f32 s14, s8 | |||||
| vmov.f32 s15, s8 | |||||
| .endm | |||||
| .macro KERNEL_S4X4 /* four KERNEL_S4X1 steps in a row (no pld hint on this path) */ | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| .endm | |||||
| .macro KERNEL_S4X1 /* multiplies four consecutive 8-byte elements of A by one element of x (x stepped by INC_X) */ | |||||
| flds s0 , [ AO1 ] /* s0,s1 = A element 0 */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s2 , [ AO1, #8 ] /* s2,s3 = A element 1 */ | |||||
| flds s3 , [ AO1, #12 ] | |||||
| flds s4 , [ XO ] /* s4,s5 = current x element */ | |||||
| flds s5 , [ XO, #4 ] | |||||
| fmacs s8 , s0, s4 /* s8,s9 accumulate element 0 */ | |||||
| fmacs s9 , s0, s5 | |||||
| fmacs s10 , s2, s4 /* s10,s11 accumulate element 1 */ | |||||
| fmacs s11 , s2, s5 | |||||
| KMAC_R s8 , s1, s5 /* products with the second word of A: add or subtract as selected by KMAC_R / KMAC_I */ | |||||
| KMAC_I s9 , s1, s4 | |||||
| KMAC_R s10 , s3, s5 | |||||
| KMAC_I s11 , s3, s4 | |||||
| flds s0 , [ AO1, #16 ] /* elements 2 and 3 reuse s0 - s3 */ | |||||
| flds s1 , [ AO1, #20 ] | |||||
| flds s2 , [ AO1, #24 ] | |||||
| flds s3 , [ AO1, #28 ] | |||||
| fmacs s12 , s0, s4 /* s12,s13 accumulate element 2 */ | |||||
| fmacs s13 , s0, s5 | |||||
| fmacs s14 , s2, s4 /* s14,s15 accumulate element 3 */ | |||||
| fmacs s15 , s2, s5 | |||||
| KMAC_R s12 , s1, s5 | |||||
| KMAC_I s13 , s1, s4 | |||||
| KMAC_R s14 , s3, s5 | |||||
| KMAC_I s15 , s3, s4 | |||||
| add XO , XO, INC_X /* INC_X is already in bytes on this path */ | |||||
| add AO1 , AO1, LDA | |||||
| add AO2 , AO2, LDA /* AO2 is advanced but never read inside this macro */ | |||||
| .endm | |||||
| .macro SAVE_S4 /* adds the four scaled accumulator pairs s8 - s15 to four y elements that are INC_Y bytes apart */ | |||||
| flds s0, ALPHA_R /* s0,s1 = the two alpha words saved on the stack */ | |||||
| flds s1, ALPHA_I | |||||
| fldmias YO, { s4 - s5 } /* y element 0 <- accumulators s8,s9 */ | |||||
| FMAC_R1 s4 , s0 , s8 /* first word: ALPHA_R * acc.w0, then ALPHA_I * acc.w1 with the selected sign */ | |||||
| FMAC_I1 s5 , s0 , s9 /* second word: ALPHA_R * acc.w1, then ALPHA_I * acc.w0 */ | |||||
| FMAC_R2 s4 , s1 , s9 | |||||
| FMAC_I2 s5 , s1 , s8 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s6 - s7 } /* y element 1 <- accumulators s10,s11 */ | |||||
| FMAC_R1 s6 , s0 , s10 | |||||
| FMAC_I1 s7 , s0 , s11 | |||||
| FMAC_R2 s6 , s1 , s11 | |||||
| FMAC_I2 s7 , s1 , s10 | |||||
| fstmias YO, { s6 - s7 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s4 - s5 } /* y element 2 <- accumulators s12,s13 */ | |||||
| FMAC_R1 s4 , s0 , s12 | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s6 - s7 } /* y element 3 <- accumulators s14,s15 */ | |||||
| FMAC_R1 s6 , s0 , s14 | |||||
| FMAC_I1 s7 , s0 , s15 | |||||
| FMAC_R2 s6 , s1 , s15 | |||||
| FMAC_I2 s7 , s1 , s14 | |||||
| fstmias YO, { s6 - s7 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /* Clear the accumulator pair s8, s9 for the strided path. | |||||
| * The zero is moved in from an integer register: "vsub.f32 s8, s8, s8" only gives | |||||
| * zero for finite contents and turns a NaN or +/-Inf left in s8 into NaN. | |||||
| * r3 is saved and restored; no core register or condition flag changes. */ | |||||
| .macro INIT_S1 | |||||
| push {r3} // borrow r3 to carry an integer zero | |||||
| mov r3, #0 // plain mov: condition flags stay untouched | |||||
| vmov s8 , r3 // s8 = +0.0 regardless of its previous contents | |||||
| pop {r3} | |||||
| vmov.f32 s9 , s8 | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* multiplies one 8-byte element of A by one element of x (x stepped by INC_X) */ | |||||
| flds s0 , [ AO1 ] /* s0,s1 = A element */ | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s4 , [ XO ] /* s4,s5 = x element */ | |||||
| flds s5 , [ XO, #4 ] | |||||
| fmacs s8 , s0, s4 /* s8,s9 accumulate the result */ | |||||
| fmacs s9 , s0, s5 | |||||
| KMAC_R s8 , s1, s5 /* sign chosen by the KMAC_R / KMAC_I selection */ | |||||
| KMAC_I s9 , s1, s4 | |||||
| add XO , XO, INC_X /* INC_X is already in bytes on this path */ | |||||
| add AO1 , AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_S1 /* adds the scaled accumulator pair s8, s9 to one y element and advances YO by INC_Y bytes */ | |||||
| flds s0, ALPHA_R /* s0,s1 = the two alpha words saved on the stack */ | |||||
| flds s1, ALPHA_I | |||||
| fldmias YO, { s4 - s5 } | |||||
| FMAC_R1 s4 , s0 , s8 /* first word: ALPHA_R * acc.w0, then ALPHA_I * acc.w1 with the selected sign */ | |||||
| FMAC_I1 s5 , s0 , s9 /* second word: ALPHA_R * acc.w1, then ALPHA_I * acc.w0 */ | |||||
| FMAC_R2 s4 , s1 , s9 | |||||
| FMAC_I2 s5 , s1 , s8 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* kernel entry; all local labels below are prefixed cgemvn_kernel_ */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} /* save 7 core registers (28 bytes) */ | |||||
| add fp, sp, #28 /* fp = sp as it was on entry, just above the saved registers */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r12, fp, #192 /* r12 = start of the floating point save area */ | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
| cmp OLD_M, #0 /* nothing to do when M <= 0 */ | |||||
| ble cgemvn_kernel_L999 | |||||
| cmp N, #0 /* nothing to do when N <= 0 */ | |||||
| ble cgemvn_kernel_L999 | |||||
| str OLD_A, A /* spill the incoming A pointer and M to their stack slots */ | |||||
| str OLD_M, M | |||||
| vstr s0 , ALPHA_R /* spill s0/s1; presumably alpha real/imag, confirm against the caller */ | |||||
| vstr s1 , ALPHA_I | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 /* a zero increment on either vector exits immediately */ | |||||
| beq cgemvn_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq cgemvn_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #4 // LDA * SIZE * 2 | |||||
| #else | |||||
| lsl LDA, LDA, #3 // LDA * SIZE * 2 | |||||
| #endif | |||||
| cmp INC_X, #1 /* take the F (unit stride) path only if both increments are 1 */ | |||||
| bne cgemvn_kernel_S4_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne cgemvn_kernel_S4_BEGIN | |||||
| cgemvn_kernel_F4_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = M / 4 | |||||
| ble cgemvn_kernel_F1_BEGIN | |||||
| cgemvn_kernel_F4X4: /* outer loop: one group of 4 rows per iteration */ | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #32 /* advance the saved A pointer by 32 bytes for the next group */ | |||||
| str r3 , A | |||||
| add AO2, AO2, LDA /* AO2 ends up at AO1 + 3 * LDA */ | |||||
| add AO2, AO2, LDA | |||||
| ldr XO , X /* restart at the beginning of x */ | |||||
| INIT_F4 | |||||
| asrs J, N, #2 // J = N / 4 | |||||
| ble cgemvn_kernel_F4X1 | |||||
| cgemvn_kernel_F4X4_10: | |||||
| KERNEL_F4X4 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_F4X4_10 | |||||
| cgemvn_kernel_F4X1: | |||||
| ands J, N , #3 /* J = N % 4, the columns left over */ | |||||
| ble cgemvn_kernel_F4_END | |||||
| cgemvn_kernel_F4X1_10: | |||||
| KERNEL_F4X1 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_F4X1_10 | |||||
| cgemvn_kernel_F4_END: | |||||
| SAVE_F4 | |||||
| subs I , I , #1 | |||||
| bne cgemvn_kernel_F4X4 | |||||
| cgemvn_kernel_F1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #3 /* I = M % 4, the rows left over */ | |||||
| ble cgemvn_kernel_L999 | |||||
| cgemvn_kernel_F1X1: /* outer loop: one row per iteration */ | |||||
| ldr AO1, A | |||||
| add r3, AO1, #8 /* advance the saved A pointer by 8 bytes for the next row */ | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| mov J, N /* N > 0 was checked above, so the loop runs at least once */ | |||||
| cgemvn_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_F1X1_10 | |||||
| cgemvn_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| subs I , I , #1 | |||||
| bne cgemvn_kernel_F1X1 | |||||
| b cgemvn_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| cgemvn_kernel_S4_BEGIN: /* S (strided) path: same loop structure, increments scaled to bytes */ | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = M / 4 | |||||
| ble cgemvn_kernel_S1_BEGIN | |||||
| cgemvn_kernel_S4X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA /* here AO2 = AO1 + LDA, unlike the F4X4 setup */ | |||||
| add r3 , AO1, #32 | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S4 | |||||
| asrs J, N, #2 // J = N / 4 | |||||
| ble cgemvn_kernel_S4X1 | |||||
| cgemvn_kernel_S4X4_10: | |||||
| KERNEL_S4X4 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_S4X4_10 | |||||
| cgemvn_kernel_S4X1: | |||||
| ands J, N , #3 /* J = N % 4 */ | |||||
| ble cgemvn_kernel_S4_END | |||||
| cgemvn_kernel_S4X1_10: | |||||
| KERNEL_S4X1 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_S4X1_10 | |||||
| cgemvn_kernel_S4_END: | |||||
| SAVE_S4 | |||||
| subs I , I , #1 | |||||
| bne cgemvn_kernel_S4X4 | |||||
| cgemvn_kernel_S1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #3 /* I = M % 4 */ | |||||
| ble cgemvn_kernel_L999 | |||||
| cgemvn_kernel_S1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #8 | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| mov J, N | |||||
| cgemvn_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs J, J, #1 | |||||
| bne cgemvn_kernel_S1X1_10 | |||||
| cgemvn_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| subs I , I , #1 | |||||
| bne cgemvn_kernel_S1X1 /* falls through to the common exit when done */ | |||||
| /*************************************************************************************************************/ | |||||
| cgemvn_kernel_L999: /* common exit: restore registers and return 0 */ | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 /* drop the reserved stack; sp points at the saved core registers again */ | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,607 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/29 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes reserved below the saved core registers */ | |||||
| #define OLD_LDA [fp, #0 ] /* stack-passed arguments; fp equals the entry sp (see prologue) */ | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #define OLD_A r3 /* incoming A pointer; spilled to slot A before r3 is reused as scratch */ | |||||
| #define OLD_N r1 /* incoming N; spilled to slot N because r1 is reused as AO1 */ | |||||
| #define M r0 /* M stays in r0 for the whole kernel */ | |||||
| #define AO1 r1 /* read pointer into the first column being processed */ | |||||
| #define J r2 /* column loop counter */ | |||||
| #define AO2 r4 /* read pointer into the second column (AO1 + LDA) */ | |||||
| #define XO r5 /* read pointer into x */ | |||||
| #define YO r6 /* read/write pointer into y */ | |||||
| #define LDA r7 /* column stride, scaled to bytes in the prologue */ | |||||
| #define INC_X r8 | |||||
| #define INC_Y r9 | |||||
| #define I r12 /* row loop counter */ | |||||
| #define N [fp, #-252 ] /* stack slot holding N */ | |||||
| #define A [fp, #-256 ] /* stack slot holding the A pointer for the next column group */ | |||||
| #define X_PRE 512 /* NOTE(review): X_PRE and A_PRE are not referenced by the macros or code of this kernel */ | |||||
| #define A_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if !defined(CONJ) && !defined(XCONJ) | |||||
| #define KMAC_R fnmacs /* KMAC_R/KMAC_I: op applied to the a_imag * x products in the KERNEL_* macros */ | |||||
| #define KMAC_I fmacs /* fmacs accumulates the product, fnmacs subtracts it */ | |||||
| #define FMAC_R1 fmacs /* FMAC_*: ops applied to the alpha * accumulator products in the SAVE_* macros */ | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_I1 fmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #elif defined(CONJ) && !defined(XCONJ) | |||||
| #define KMAC_R fmacs /* signs of both a_imag products flipped relative to the first case */ | |||||
| #define KMAC_I fnmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_I1 fmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #elif !defined(CONJ) && defined(XCONJ) | |||||
| #define KMAC_R fmacs | |||||
| #define KMAC_I fnmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fmacs /* with XCONJ the sign flips move to FMAC_R2 and FMAC_I1 in the save step */ | |||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #else | |||||
| #define KMAC_R fnmacs /* CONJ and XCONJ both defined: kernel ops as in the first case */ | |||||
| #define KMAC_I fmacs | |||||
| #define FMAC_R1 fmacs | |||||
| #define FMAC_R2 fmacs | |||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fmacs | |||||
| #endif | |||||
| .macro INIT_F2 | |||||
| /* Clear the four accumulators s12-s15 (two complex sums) to +0.0. | |||||
| * A register is not zeroed by subtracting it from itself: if it holds | |||||
| * NaN or Inf (from a previous accumulation, or whatever it held on | |||||
| * entry) then x - x is NaN. Load an all-zero bit pattern instead. | |||||
| * Clobbers r3, which is scratch at every point this macro is invoked. */ | |||||
| mov r3 , #0 | |||||
| vmov s12, r3 | |||||
| vmov.f32 s13, s12 | |||||
| vmov.f32 s14, s12 | |||||
| vmov.f32 s15, s12 | |||||
| .endm | |||||
| .macro KERNEL_F2X4 /* four consecutive KERNEL_F2X1 steps, i.e. the row loop unrolled by 4 */ | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| .endm | |||||
| .macro KERNEL_F2X1 /* one row step for two columns, unit-stride x */ | |||||
| fldmias XO! , { s2 - s3 } /* x element (two floats), XO post-incremented */ | |||||
| fldmias AO1!, { s4 - s5 } /* element of the first column, AO1 post-incremented */ | |||||
| fldmias AO2!, { s8 - s9 } /* element of the second column, AO2 post-incremented */ | |||||
| fmacs s12 , s4 , s2 /* s12/s13 accumulate the complex product for the first column */ | |||||
| fmacs s13 , s4 , s3 | |||||
| KMAC_R s12 , s5 , s3 /* sign of the second-component products is chosen by KMAC_R / KMAC_I */ | |||||
| KMAC_I s13 , s5 , s2 | |||||
| fmacs s14 , s8 , s2 /* s14/s15 accumulate the same for the second column */ | |||||
| fmacs s15 , s8 , s3 | |||||
| KMAC_R s14 , s9 , s3 | |||||
| KMAC_I s15 , s9 , s2 | |||||
| .endm | |||||
| .macro SAVE_F2 /* add the scaled accumulators s12-s15 into two adjacent y elements */ | |||||
| fldmias YO, { s4 - s7 } /* load four floats from y, no writeback yet */ | |||||
| FMAC_R1 s4 , s0 , s12 /* s0/s1 are the scale factors; presumably alpha real/imag, confirm against the caller */ | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| FMAC_R1 s6 , s0 , s14 | |||||
| FMAC_I1 s7 , s0 , s15 | |||||
| FMAC_R2 s6 , s1 , s15 | |||||
| FMAC_I2 s7 , s1 , s14 | |||||
| fstmias YO!, { s4 - s7 } /* store back and advance YO past both elements */ | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
| .macro INIT_F1 | |||||
| /* Clear the accumulator pair s12/s13 (one complex sum) to +0.0. | |||||
| * Subtracting a register from itself yields NaN when it holds NaN or | |||||
| * Inf, so an all-zero bit pattern is loaded instead. | |||||
| * Clobbers r3, which is scratch at every point this macro is invoked. */ | |||||
| mov r3 , #0 | |||||
| vmov s12, r3 | |||||
| vmov.f32 s13, s12 | |||||
| .endm | |||||
| .macro KERNEL_F1X4 /* four consecutive KERNEL_F1X1 steps, i.e. the row loop unrolled by 4 */ | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* one row step for a single column, unit-stride x */ | |||||
| fldmias XO! , { s2 - s3 } /* x element (two floats), XO post-incremented */ | |||||
| fldmias AO1!, { s4 - s5 } /* column element, AO1 post-incremented */ | |||||
| fmacs s12 , s4 , s2 /* s12/s13 accumulate the complex product */ | |||||
| fmacs s13 , s4 , s3 | |||||
| KMAC_R s12 , s5 , s3 /* sign chosen by KMAC_R / KMAC_I */ | |||||
| KMAC_I s13 , s5 , s2 | |||||
| .endm | |||||
| .macro SAVE_F1 /* add the scaled accumulator pair s12/s13 into one y element */ | |||||
| fldmias YO, { s4 - s5 } /* load two floats from y, no writeback yet */ | |||||
| FMAC_R1 s4 , s0 , s12 /* s0/s1 are the scale factors; presumably alpha real/imag, confirm against the caller */ | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| fstmias YO!, { s4 - s5 } /* store back and advance YO by one element */ | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
| .macro INIT_S2 | |||||
| /* Clear the four accumulators s12-s15 (two complex sums) to +0.0. | |||||
| * Subtracting a register from itself yields NaN when it holds NaN or | |||||
| * Inf, so an all-zero bit pattern is loaded instead. | |||||
| * Clobbers r3, which is scratch at every point this macro is invoked. */ | |||||
| mov r3 , #0 | |||||
| vmov s12, r3 | |||||
| vmov.f32 s13, s12 | |||||
| vmov.f32 s14, s12 | |||||
| vmov.f32 s15, s12 | |||||
| .endm | |||||
| .macro KERNEL_S2X4 /* four consecutive KERNEL_S2X1 steps, i.e. the row loop unrolled by 4 */ | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| .endm | |||||
| .macro KERNEL_S2X1 /* one row step for two columns, strided x */ | |||||
| fldmias XO , { s2 - s3 } /* x element (two floats), no writeback; XO is stepped by INC_X below */ | |||||
| fldmias AO1!, { s4 - s5 } /* element of the first column, AO1 post-incremented */ | |||||
| fldmias AO2!, { s8 - s9 } /* element of the second column, AO2 post-incremented */ | |||||
| fmacs s12 , s4 , s2 /* s12/s13 accumulate the complex product for the first column */ | |||||
| fmacs s13 , s4 , s3 | |||||
| KMAC_R s12 , s5 , s3 | |||||
| KMAC_I s13 , s5 , s2 | |||||
| fmacs s14 , s8 , s2 /* s14/s15 accumulate the same for the second column */ | |||||
| fmacs s15 , s8 , s3 | |||||
| KMAC_R s14 , s9 , s3 | |||||
| KMAC_I s15 , s9 , s2 | |||||
| add XO, XO, INC_X /* INC_X is already scaled to bytes on the strided path */ | |||||
| .endm | |||||
| .macro SAVE_S2 /* add the scaled accumulators into two y elements that are INC_Y bytes apart */ | |||||
| fldmias YO, { s4 - s5 } /* first y element */ | |||||
| FMAC_R1 s4 , s0 , s12 /* s0/s1 are the scale factors; presumably alpha real/imag, confirm against the caller */ | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s6 - s7 } /* second y element */ | |||||
| FMAC_R1 s6 , s0 , s14 | |||||
| FMAC_I1 s7 , s0 , s15 | |||||
| FMAC_R2 s6 , s1 , s15 | |||||
| FMAC_I2 s7 , s1 , s14 | |||||
| fstmias YO, { s6 - s7 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
| .macro INIT_S1 | |||||
| /* Clear the accumulator pair s12/s13 (one complex sum) to +0.0. | |||||
| * Subtracting a register from itself yields NaN when it holds NaN or | |||||
| * Inf, so an all-zero bit pattern is loaded instead. | |||||
| * Clobbers r3, which is scratch at every point this macro is invoked. */ | |||||
| mov r3 , #0 | |||||
| vmov s12, r3 | |||||
| vmov.f32 s13, s12 | |||||
| .endm | |||||
| .macro KERNEL_S1X4 /* four consecutive KERNEL_S1X1 steps, i.e. the row loop unrolled by 4 */ | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* one row step for a single column, strided x */ | |||||
| fldmias XO , { s2 - s3 } /* x element (two floats), no writeback; XO is stepped by INC_X below */ | |||||
| fldmias AO1!, { s4 - s5 } /* column element, AO1 post-incremented */ | |||||
| fmacs s12 , s4 , s2 /* s12/s13 accumulate the complex product */ | |||||
| fmacs s13 , s4 , s3 | |||||
| KMAC_R s12 , s5 , s3 | |||||
| KMAC_I s13 , s5 , s2 | |||||
| add XO, XO, INC_X /* INC_X is already scaled to bytes on the strided path */ | |||||
| .endm | |||||
| .macro SAVE_S1 /* add the scaled accumulator pair s12/s13 into one y element, then step YO by INC_Y */ | |||||
| fldmias YO, { s4 - s5 } | |||||
| FMAC_R1 s4 , s0 , s12 /* s0/s1 are the scale factors; presumably alpha real/imag, confirm against the caller */ | |||||
| FMAC_I1 s5 , s0 , s13 | |||||
| FMAC_R2 s4 , s1 , s13 | |||||
| FMAC_I2 s5 , s1 , s12 | |||||
| fstmias YO, { s4 - s5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* kernel entry; all local labels below are prefixed cgemvt_kernel_ */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} /* save 7 core registers (28 bytes) */ | |||||
| add fp, sp, #28 /* fp = sp as it was on entry, so [fp, #0] is the first stack argument */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r12, fp, #192 /* r12 = start of the floating point save area */ | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
| cmp M, #0 /* nothing to do when M <= 0 */ | |||||
| ble cgemvt_kernel_L999 | |||||
| cmp OLD_N, #0 /* nothing to do when N <= 0 */ | |||||
| ble cgemvt_kernel_L999 | |||||
| str OLD_A, A /* spill A and N; r3 becomes scratch and r1 becomes AO1 */ | |||||
| str OLD_N, N | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 /* a zero increment on either vector exits immediately */ | |||||
| beq cgemvt_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq cgemvt_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #4 // LDA * SIZE * 2 (two values per complex element) | |||||
| #else | |||||
| lsl LDA, LDA, #3 // LDA * SIZE * 2 (two values per complex element) | |||||
| #endif | |||||
| cmp INC_X, #1 /* take the F (unit stride) path only if both increments are 1 */ | |||||
| bne cgemvt_kernel_S2_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne cgemvt_kernel_S2_BEGIN | |||||
| cgemvt_kernel_F2_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble cgemvt_kernel_F1_BEGIN | |||||
| cgemvt_kernel_F2X4: /* outer loop: one pair of columns per iteration */ | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA /* AO2 = the column after AO1 */ | |||||
| add r3 , AO2, LDA /* saved A pointer moves on by two columns */ | |||||
| str r3 , A | |||||
| ldr XO , X /* restart at the beginning of x */ | |||||
| INIT_F2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble cgemvt_kernel_F2X1 | |||||
| cgemvt_kernel_F2X4_10: | |||||
| KERNEL_F2X4 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_F2X4_10 | |||||
| cgemvt_kernel_F2X1: | |||||
| ands I, M , #3 /* I = M % 4, the rows left over */ | |||||
| ble cgemvt_kernel_F2_END | |||||
| cgemvt_kernel_F2X1_10: | |||||
| KERNEL_F2X1 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_F2X1_10 | |||||
| cgemvt_kernel_F2_END: | |||||
| SAVE_F2 | |||||
| subs J , J , #1 | |||||
| bne cgemvt_kernel_F2X4 | |||||
| cgemvt_kernel_F1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 /* one column is left over when N is odd */ | |||||
| ble cgemvt_kernel_L999 | |||||
| cgemvt_kernel_F1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble cgemvt_kernel_F1X1 | |||||
| cgemvt_kernel_F1X4_10: | |||||
| KERNEL_F1X4 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_F1X4_10 | |||||
| cgemvt_kernel_F1X1: | |||||
| ands I, M , #3 /* I = M % 4 */ | |||||
| ble cgemvt_kernel_F1_END | |||||
| cgemvt_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_F1X1_10 | |||||
| cgemvt_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| b cgemvt_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| cgemvt_kernel_S2_BEGIN: /* S (strided) path: same loop structure, increments scaled to bytes */ | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble cgemvt_kernel_S1_BEGIN | |||||
| cgemvt_kernel_S2X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble cgemvt_kernel_S2X1 | |||||
| cgemvt_kernel_S2X4_10: | |||||
| KERNEL_S2X4 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_S2X4_10 | |||||
| cgemvt_kernel_S2X1: | |||||
| ands I, M , #3 /* I = M % 4 */ | |||||
| ble cgemvt_kernel_S2_END | |||||
| cgemvt_kernel_S2X1_10: | |||||
| KERNEL_S2X1 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_S2X1_10 | |||||
| cgemvt_kernel_S2_END: | |||||
| SAVE_S2 | |||||
| subs J , J , #1 | |||||
| bne cgemvt_kernel_S2X4 | |||||
| cgemvt_kernel_S1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 /* one column is left over when N is odd */ | |||||
| ble cgemvt_kernel_L999 | |||||
| cgemvt_kernel_S1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble cgemvt_kernel_S1X1 | |||||
| cgemvt_kernel_S1X4_10: | |||||
| KERNEL_S1X4 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_S1X4_10 | |||||
| cgemvt_kernel_S1X1: | |||||
| ands I, M , #3 /* I = M % 4 */ | |||||
| ble cgemvt_kernel_S1_END | |||||
| cgemvt_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs I, I, #1 | |||||
| bne cgemvt_kernel_S1X1_10 | |||||
| cgemvt_kernel_S1_END: | |||||
| SAVE_S1 /* falls through to the common exit */ | |||||
| /*************************************************************************************************************/ | |||||
| cgemvt_kernel_L999: /* common exit: restore registers and return 0 */ | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 /* drop the reserved stack; sp points at the saved core registers again */ | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,59 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| /* Strided copy: y[k*inc_y] = x[k*inc_x] for k = 0 .. n-1. | |||||
| * Copies nothing when n <= 0. Always returns 0. */ | |||||
| BLASLONG src = 0; | |||||
| BLASLONG dst = 0; | |||||
| BLASLONG remaining; | |||||
| for ( remaining = n; remaining > 0; remaining-- ) | |||||
| { | |||||
| y[dst] = x[src]; | |||||
| src += inc_x; | |||||
| dst += inc_y; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,222 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes reserved below the saved core registers */ | |||||
| #define N r0 /* incoming element count */ | |||||
| #define X r1 /* source pointer, advanced as elements are read */ | |||||
| #define INC_X r2 /* source increment in elements; scaled to bytes on the strided path */ | |||||
| #define OLD_Y r3 /* incoming destination pointer, moved to Y in the prologue */ | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define OLD_INC_Y [fp, #4 ] /* first stack argument: fp is entry sp - 4 (28 bytes pushed, then fp = sp + 24) */ | |||||
| #define I r5 /* loop counter */ | |||||
| #define Y r6 /* destination pointer */ | |||||
| #define INC_Y r7 /* destination increment */ | |||||
| #define X_PRE 256 /* prefetch distance in bytes used by COPY_F4 */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY_F4 /* copy four contiguous doubles from X to Y, advancing both pointers */ | |||||
| pld [ X, #X_PRE ] /* prefetch X_PRE bytes ahead in the source */ | |||||
| fldmiad X!, { d0 - d3 } | |||||
| fstmiad Y!, { d0 - d3 } | |||||
| .endm | |||||
| .macro COPY_F1 /* copy one double from X to Y, advancing both pointers */ | |||||
| fldmiad X!, { d0 } | |||||
| fstmiad Y!, { d0 } | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY_S4 /* copy four doubles one at a time, stepping X by INC_X and Y by INC_Y (byte strides) */ | |||||
| nop | |||||
| fldmiad X, { d0 } /* element 1; no writeback, the pointers are stepped explicitly */ | |||||
| fstmiad Y, { d0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d1 } /* element 2 */ | |||||
| fstmiad Y, { d1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d0 } /* element 3 */ | |||||
| fstmiad Y, { d0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d1 } /* element 4 */ | |||||
| fstmiad Y, { d1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| .macro COPY_S1 /* copy one double, then step X by INC_X and Y by INC_Y (byte strides) */ | |||||
| fldmiad X, { d0 } | |||||
| fstmiad Y, { d0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* kernel entry; all local labels below are prefixed dcopy_kernel_ */ | |||||
| .align 5 | |||||
| push {r4 - r9, fp} /* save 7 core registers (28 bytes) */ | |||||
| add fp, sp, #24 /* fp = entry sp - 4, so [fp, #4] is the first stack argument */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r4, fp, #128 /* r4 = start of the floating point save area */ | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| cmp N, #0 /* nothing to copy when N <= 0 */ | |||||
| ble dcopy_kernel_L999 | |||||
| cmp INC_X, #0 /* a zero increment on either vector exits without copying */ | |||||
| beq dcopy_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq dcopy_kernel_L999 | |||||
| cmp INC_X, #1 /* take the F (contiguous) path only if both increments are 1 */ | |||||
| bne dcopy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne dcopy_kernel_S_BEGIN | |||||
| dcopy_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble dcopy_kernel_F1 | |||||
| dcopy_kernel_F4: /* four elements per iteration */ | |||||
| COPY_F4 | |||||
| subs I, I, #1 | |||||
| bne dcopy_kernel_F4 | |||||
| dcopy_kernel_F1: | |||||
| ands I, N, #3 /* I = N % 4, the elements left over */ | |||||
| ble dcopy_kernel_L999 | |||||
| dcopy_kernel_F10: | |||||
| COPY_F1 | |||||
| subs I, I, #1 | |||||
| bne dcopy_kernel_F10 | |||||
| b dcopy_kernel_L999 | |||||
| dcopy_kernel_S_BEGIN: /* strided path */ | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble dcopy_kernel_S1 | |||||
| dcopy_kernel_S4: | |||||
| COPY_S4 | |||||
| subs I, I, #1 | |||||
| bne dcopy_kernel_S4 | |||||
| dcopy_kernel_S1: | |||||
| ands I, N, #3 /* I = N % 4 */ | |||||
| ble dcopy_kernel_L999 | |||||
| dcopy_kernel_S10: | |||||
| COPY_S1 | |||||
| subs I, I, #1 | |||||
| bne dcopy_kernel_S10 | |||||
| dcopy_kernel_L999: /* common exit: restore registers and return 0 */ | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 /* drop the reserved stack; sp points at the saved core registers again */ | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,248 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/11 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes reserved below the saved core registers */ | |||||
| #define N r0 /* incoming element count */ | |||||
| #define X r1 /* first vector pointer, advanced as elements are read */ | |||||
| #define INC_X r2 /* increment of X in elements; scaled to bytes on the strided path */ | |||||
| #define OLD_Y r3 /* incoming second vector pointer, moved to Y in the prologue */ | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define OLD_INC_Y [fp, #4 ] /* first stack argument: fp is entry sp - 4 (28 bytes pushed, then fp = sp + 24) */ | |||||
| #define I r5 /* loop counter */ | |||||
| #define Y r6 /* second vector pointer */ | |||||
| #define INC_Y r7 /* increment of Y */ | |||||
| #define X_PRE 512 /* prefetch distance in bytes, used for both X and Y in KERNEL_F4 */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro KERNEL_F4 /* four contiguous products; d0 and d1 are two partial sums, loads are interleaved with the multiply-accumulates */ | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X!, { d8 } /* x0 */ | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad Y!, { d4 } /* y0 */ | |||||
| fldmiad Y!, { d5 } /* y1 */ | |||||
| fmacd d0 , d4, d8 /* d0 += y0 * x0 */ | |||||
| fldmiad X!, { d9 } /* x1 */ | |||||
| fldmiad Y!, { d6 } /* y2 */ | |||||
| fmacd d1 , d5, d9 /* d1 += y1 * x1 */ | |||||
| fldmiad X!, { d10 } /* x2 */ | |||||
| fldmiad X!, { d11 } /* x3 */ | |||||
| fmacd d0 , d6, d10 /* d0 += y2 * x2 */ | |||||
| fldmiad Y!, { d7 } /* y3 */ | |||||
| fmacd d1 , d7, d11 /* d1 += y3 * x3 */ | |||||
| .endm | |||||
| .macro KERNEL_F1 /* one contiguous product accumulated into d0, both pointers advanced */ | |||||
| fldmiad X!, { d4 } | |||||
| fldmiad Y!, { d8 } | |||||
| fmacd d0 , d4, d8 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro KERNEL_S4 /* four strided products; sums alternate between d0 and d1 */ | |||||
| nop | |||||
| fldmiad X, { d4 } /* element 1; no writeback, the pointers are stepped explicitly */ | |||||
| fldmiad Y, { d8 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacd d0 , d4, d8 | |||||
| fldmiad X, { d5 } /* element 2 */ | |||||
| fldmiad Y, { d9 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacd d1 , d5, d9 | |||||
| fldmiad X, { d6 } /* element 3 */ | |||||
| fldmiad Y, { d10 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacd d0 , d6, d10 | |||||
| fldmiad X, { d7 } /* element 4 */ | |||||
| fldmiad Y, { d11 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacd d1 , d7, d11 | |||||
| .endm | |||||
| .macro KERNEL_S1 /* one strided product accumulated into d0 */ | |||||
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d8 } | |||||
| add X, X, INC_X /* INC_X / INC_Y are byte strides on this path */ | |||||
| fmacd d0 , d4, d8 | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} // save 7 core registers (28 bytes) | |||||
| add fp, sp, #24 // fp = entry sp - 4, so [fp, #4] is the first stack argument | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| mov r4, #0 // r4 is free again after the vstm above | |||||
| vmov d0 , r4, r4 // clear both partial sums with an all-zero bit pattern; | |||||
| vmov d1 , r4, r4 // "d - d" would give NaN if d held NaN or Inf on entry | |||||
| cmp N, #0 // N <= 0: return 0.0 | |||||
| ble ddot_kernel_L999 | |||||
| cmp INC_X, #0 // a zero increment on either vector returns 0.0 | |||||
| beq ddot_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq ddot_kernel_L999 | |||||
| cmp INC_X, #1 // take the F (contiguous) path only if both increments are 1 | |||||
| bne ddot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne ddot_kernel_S_BEGIN | |||||
| ddot_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble ddot_kernel_F1 | |||||
| ddot_kernel_F4: // loop body is unrolled twice (8 elements per full pass) | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble ddot_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne ddot_kernel_F4 | |||||
| ddot_kernel_F1: | |||||
| ands I, N, #3 // I = N % 4, the elements left over | |||||
| ble ddot_kernel_L999 | |||||
| ddot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne ddot_kernel_F10 | |||||
| b ddot_kernel_L999 | |||||
| ddot_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble ddot_kernel_S1 | |||||
| ddot_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne ddot_kernel_S4 | |||||
| ddot_kernel_S1: | |||||
| ands I, N, #3 // I = N % 4 | |||||
| ble ddot_kernel_L999 | |||||
| ddot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne ddot_kernel_S10 | |||||
| ddot_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| vadd.f64 d0 , d0, d1 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,806 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/27 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes subtracted from sp by the prologue for the local frame */ | |||||
| #define OLD_M r0 /* register stored to the M slot by the prologue */ | |||||
| #define OLD_N r1 /* register stored to the N slot by the prologue */ | |||||
| #define OLD_K r2 /* register stored to the K slot by the prologue */ | |||||
| #define OLD_A r3 /* register stored to the A slot by the prologue */ | |||||
| #define OLD_ALPHA d0 /* VFP register stored to the ALPHA slot by the prologue */ | |||||
| /****************************************************** | |||||
| * The 64 bytes starting at [fp, #-128] are reserved | |||||
| * for store and restore of floating point | |||||
| * registers d8 - d15 (prologue vstm / epilogue vldm) | |||||
| *******************************************************/ | |||||
| #define LDC [fp, #-252 ] /* frame slot: OLD_LDC shifted left by 3 (ldc * 8) */ | |||||
| #define M [fp, #-256 ] /* frame slot holding OLD_M */ | |||||
| #define N [fp, #-260 ] /* frame slot holding OLD_N */ | |||||
| #define K [fp, #-264 ] /* frame slot holding OLD_K */ | |||||
| #define A [fp, #-268 ] /* frame slot holding OLD_A */ | |||||
| #define ALPHA [fp, #-280] /* frame slot holding OLD_ALPHA (8 bytes, lowest address of the frame) */ | |||||
| #define B [fp, #4 ] /* word above the saved registers; loaded into BC by the prologue */ | |||||
| #define C [fp, #8 ] /* word above the saved registers; read and rewritten per column group */ | |||||
| #define OLD_LDC [fp, #12 ] /* word above the saved registers; read once to compute LDC */ | |||||
| #define I r0 /* row-tile counter, also used to test the low bits of M */ | |||||
| #define J r1 /* column-group counter */ | |||||
| #define L r2 /* inner loop counter over K */ | |||||
| #define AO r5 /* running read pointer into A */ | |||||
| #define BO r6 /* running read pointer into B, reset from BC for every tile */ | |||||
| #define CO1 r8 /* write pointer for the first column of the tile */ | |||||
| #define CO2 r9 /* write pointer for the second column (CO1 + LDC) */ | |||||
| #define K1 r7 /* K kept in a register */ | |||||
| #define BC r12 /* start of the B data for the current column group */ | |||||
| #define A_PRE 96 /* byte offset used by pld on AO */ | |||||
| #define B_PRE 96 /* byte offset used by pld on BO */ | |||||
| #define C_PRE 32 /* byte offset used by pld on CO1 / CO2 */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro INIT4x2 | |||||
| /* Clear the eight accumulators d8 - d15 used by the 4x2 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| vmov.f64 d9, d8 | |||||
| vmov.f64 d10, d8 | |||||
| vmov.f64 d11, d8 | |||||
| vmov.f64 d12, d8 | |||||
| vmov.f64 d13, d8 | |||||
| vmov.f64 d14, d8 | |||||
| vmov.f64 d15, d8 | |||||
| .endm | |||||
| .macro KERNEL4x2_SUB /* one K step of the 4x2 tile: d8-d11 += A * d4, d12-d15 += A * d5 */ | |||||
| pld [ AO, #A_PRE ] /* prefetch A_PRE bytes ahead of AO */ | |||||
| fldd d4 , [ BO ] /* d4 = first double at BO */ | |||||
| fldd d0 , [ AO ] /* d0 - d3 = the four doubles at AO (loads interleaved with the FMAs) */ | |||||
| fldd d1 , [ AO, #8 ] | |||||
| fmacd d8 , d0, d4 /* d8 += d0 * d4 */ | |||||
| fldd d2 , [ AO, #16 ] | |||||
| fmacd d9 , d1, d4 | |||||
| fldd d3 , [ AO, #24 ] | |||||
| fmacd d10 , d2, d4 | |||||
| fldd d5 , [ BO, #8 ] /* d5 = second double at BO */ | |||||
| fmacd d11 , d3, d4 | |||||
| fmacd d12 , d0, d5 | |||||
| fmacd d13 , d1, d5 | |||||
| add AO , AO, #32 /* AO advances by four doubles */ | |||||
| fmacd d14 , d2, d5 | |||||
| add BO , BO, #16 /* BO advances by two doubles */ | |||||
| fmacd d15 , d3, d5 | |||||
| .endm | |||||
| .macro SAVE4x2 /* add ALPHA * (d8-d15) into four doubles at CO1 and four at CO2 */ | |||||
| ldr r3 , LDC /* r3 = value of the LDC slot (ldc * 8) */ | |||||
| add CO2 , CO1, r3 /* CO2 = CO1 + LDC */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] /* d4 - d7 = current values at CO1 */ | |||||
| fldd d5 , [CO1, #8 ] | |||||
| pld [ CO1, #C_PRE ] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fldd d6 , [CO1, #16 ] | |||||
| fmacd d5 , d0 , d9 | |||||
| fldd d7 , [CO1, #24 ] | |||||
| fmacd d6 , d0 , d10 | |||||
| fstd d4 , [CO1] | |||||
| fmacd d7 , d0 , d11 | |||||
| fstd d5 , [CO1, #8 ] | |||||
| fstd d6 , [CO1, #16 ] | |||||
| fstd d7 , [CO1, #24 ] | |||||
| fldd d4 , [CO2] /* same update for the four doubles at CO2, using d12 - d15 */ | |||||
| fldd d5 , [CO2, #8 ] | |||||
| pld [ CO2, #C_PRE ] | |||||
| fmacd d4 , d0 , d12 | |||||
| fldd d6 , [CO2, #16 ] | |||||
| fmacd d5 , d0 , d13 | |||||
| fldd d7 , [CO2, #24 ] | |||||
| fmacd d6 , d0 , d14 | |||||
| fstd d4 , [CO2] | |||||
| fmacd d7 , d0 , d15 | |||||
| add CO1, CO1, #32 /* CO1 advances by four doubles for the next tile */ | |||||
| fstd d5 , [CO2, #8 ] | |||||
| fstd d6 , [CO2, #16 ] | |||||
| fstd d7 , [CO2, #24 ] | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| .macro INIT2x2 | |||||
| /* Clear the four accumulators d8, d9, d12, d13 used by the 2x2 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| vmov.f64 d9, d8 | |||||
| vmov.f64 d12, d8 | |||||
| vmov.f64 d13, d8 | |||||
| .endm | |||||
| .macro KERNEL2x2_SUB /* one K step of the 2x2 tile: d8,d9 += A * d4 and d12,d13 += A * d5 */ | |||||
| fldd d4 , [ BO ] /* d4, d5 = the two doubles at BO */ | |||||
| fldd d5 , [ BO, #8 ] | |||||
| fldd d0 , [ AO ] /* d0, d1 = the two doubles at AO */ | |||||
| fldd d1 , [ AO, #8 ] | |||||
| fmacd d8 , d0, d4 /* d8 += d0 * d4 */ | |||||
| fmacd d9 , d1, d4 | |||||
| fmacd d12 , d0, d5 | |||||
| fmacd d13 , d1, d5 | |||||
| add AO , AO, #16 /* AO advances by two doubles */ | |||||
| add BO , BO, #16 /* BO advances by two doubles */ | |||||
| .endm | |||||
| .macro SAVE2x2 /* add ALPHA * (d8,d9) into two doubles at CO1 and ALPHA * (d12,d13) at CO2 */ | |||||
| ldr r3 , LDC /* r3 = value of the LDC slot (ldc * 8) */ | |||||
| add CO2 , CO1, r3 /* CO2 = CO1 + LDC */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] | |||||
| fldd d5 , [CO1, #8 ] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fmacd d5 , d0 , d9 | |||||
| fstd d4 , [CO1] | |||||
| fstd d5 , [CO1, #8 ] | |||||
| fldd d4 , [CO2] | |||||
| fldd d5 , [CO2, #8 ] | |||||
| fmacd d4 , d0 , d12 | |||||
| fmacd d5 , d0 , d13 | |||||
| fstd d4 , [CO2] | |||||
| fstd d5 , [CO2, #8 ] | |||||
| add CO1, CO1, #16 /* CO1 advances by two doubles */ | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| .macro INIT1x2 | |||||
| /* Clear the two accumulators d8 and d12 used by the 1x2 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| vmov.f64 d12, d8 | |||||
| .endm | |||||
| .macro KERNEL1x2_SUB /* one K step of the 1x2 tile: d8 += d0 * d4, d12 += d0 * d5 */ | |||||
| fldd d4 , [ BO ] /* d4, d5 = the two doubles at BO */ | |||||
| fldd d5 , [ BO, #8 ] | |||||
| fldd d0 , [ AO ] /* d0 = the double at AO */ | |||||
| fmacd d8 , d0, d4 | |||||
| fmacd d12 , d0, d5 | |||||
| add AO , AO, #8 /* AO advances by one double */ | |||||
| add BO , BO, #16 /* BO advances by two doubles */ | |||||
| .endm | |||||
| .macro SAVE1x2 /* add ALPHA * d8 into the double at CO1 and ALPHA * d12 into the double at CO2 */ | |||||
| ldr r3 , LDC /* r3 = value of the LDC slot (ldc * 8) */ | |||||
| add CO2 , CO1, r3 /* CO2 = CO1 + LDC */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fstd d4 , [CO1] | |||||
| fldd d4 , [CO2] | |||||
| fmacd d4 , d0 , d12 | |||||
| fstd d4 , [CO2] | |||||
| add CO1, CO1, #8 /* CO1 advances by one double */ | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| .macro INIT4x1 | |||||
| /* Clear the four accumulators d8 - d11 used by the 4x1 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| vmov.f64 d9, d8 | |||||
| vmov.f64 d10, d8 | |||||
| vmov.f64 d11, d8 | |||||
| .endm | |||||
| .macro KERNEL4x1_SUB /* one K step of the 4x1 tile: d8-d11 += (four doubles at AO) * d4 */ | |||||
| fldd d4 , [ BO ] /* d4 = the double at BO */ | |||||
| fldd d0 , [ AO ] /* d0 - d3 = the four doubles at AO */ | |||||
| fldd d1 , [ AO, #8 ] | |||||
| fldd d2 , [ AO, #16 ] | |||||
| fldd d3 , [ AO, #24 ] | |||||
| fmacd d8 , d0, d4 /* d8 += d0 * d4 */ | |||||
| fmacd d9 , d1, d4 | |||||
| fmacd d10 , d2, d4 | |||||
| fmacd d11 , d3, d4 | |||||
| add AO , AO, #32 /* AO advances by four doubles */ | |||||
| add BO , BO, #8 /* BO advances by one double */ | |||||
| .endm | |||||
| .macro SAVE4x1 /* add ALPHA * (d8-d11) into the four doubles at CO1 */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] /* d4 - d7 = current values at CO1 */ | |||||
| fldd d5 , [CO1, #8 ] | |||||
| fldd d6 , [CO1, #16 ] | |||||
| fldd d7 , [CO1, #24 ] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fmacd d5 , d0 , d9 | |||||
| fmacd d6 , d0 , d10 | |||||
| fmacd d7 , d0 , d11 | |||||
| fstd d4 , [CO1] | |||||
| fstd d5 , [CO1, #8 ] | |||||
| fstd d6 , [CO1, #16 ] | |||||
| fstd d7 , [CO1, #24 ] | |||||
| add CO1, CO1, #32 /* CO1 advances by four doubles */ | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| .macro INIT2x1 | |||||
| /* Clear the two accumulators d8 and d9 used by the 2x1 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| vmov.f64 d9 , d8 | |||||
| .endm | |||||
| .macro KERNEL2x1_SUB /* one K step of the 2x1 tile: d8 += d0 * d4, d9 += d1 * d4 */ | |||||
| fldd d4 , [ BO ] /* d4 = the double at BO */ | |||||
| fldd d0 , [ AO ] /* d0, d1 = the two doubles at AO */ | |||||
| fldd d1 , [ AO, #8 ] | |||||
| fmacd d8 , d0, d4 | |||||
| fmacd d9 , d1, d4 | |||||
| add AO , AO, #16 /* AO advances by two doubles */ | |||||
| add BO , BO, #8 /* BO advances by one double */ | |||||
| .endm | |||||
| .macro SAVE2x1 /* add ALPHA * (d8,d9) into the two doubles at CO1 */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] | |||||
| fldd d5 , [CO1, #8 ] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fmacd d5 , d0 , d9 | |||||
| fstd d4 , [CO1] | |||||
| fstd d5 , [CO1, #8 ] | |||||
| add CO1, CO1, #16 /* CO1 advances by two doubles */ | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| .macro INIT1x1 | |||||
| /* Clear the single accumulator d8 used by the 1x1 tile. | |||||
| * x - x is NaN when x is Inf or NaN, so the zero is built from an integer | |||||
| * zero instead of subtracting d8 from itself. Clobbers r3, which is a | |||||
| * scratch register at every place this macro is expanded. */ | |||||
| mov r3, #0 | |||||
| vmov d8, r3, r3 | |||||
| .endm | |||||
| .macro KERNEL1x1_SUB /* one K step of the 1x1 tile: d8 += d0 * d4 */ | |||||
| fldd d4 , [ BO ] /* d4 = the double at BO */ | |||||
| fldd d0 , [ AO ] /* d0 = the double at AO */ | |||||
| fmacd d8 , d0, d4 | |||||
| add AO , AO, #8 /* AO advances by one double */ | |||||
| add BO , BO, #8 /* BO advances by one double */ | |||||
| .endm | |||||
| .macro SAVE1x1 /* add ALPHA * d8 into the double at CO1 */ | |||||
| fldd d0, ALPHA /* d0 = ALPHA */ | |||||
| fldd d4 , [CO1] | |||||
| fmacd d4 , d0 , d8 /* d4 += ALPHA * d8 */ | |||||
| fstd d4 , [CO1] | |||||
| add CO1, CO1, #8 /* CO1 advances by one double */ | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /* | |||||
| * DGEMM kernel body. Walks the output two columns at a time (then one odd | |||||
| * column), and inside each column group processes row tiles of 4, then 2, | |||||
| * then 1. Every tile is cleared with INITmxn, accumulated over K steps with | |||||
| * KERNELmxn_SUB (unrolled by 8 plus a remainder loop) and written back with | |||||
| * SAVEmxn, which adds ALPHA times the accumulators to the values at CO1/CO2. | |||||
| * | |||||
| * Inputs: r0 = M, r1 = N, r2 = K, r3 = A, d0 = ALPHA; B, C and ldc are read | |||||
| * from [fp, #4], [fp, #8] and [fp, #12]. Returns 0 in r0. | |||||
| * | |||||
| * Flag handling: tst, ands and asrs do not write the V flag, while ble tests | |||||
| * "Z set or N != V". Remainder tests therefore branch with beq, and count | |||||
| * tests use asr followed by an explicit cmp against #0, so no branch depends | |||||
| * on a V flag left over from earlier code. | |||||
| */ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_M, M | |||||
| str OLD_N, N | |||||
| str OLD_K, K | |||||
| str OLD_A, A | |||||
| vstr OLD_ALPHA, ALPHA | |||||
| sub r3, fp, #128 | |||||
| vstm r3, { d8 - d15} // store floating point registers | |||||
| ldr r3, OLD_LDC | |||||
| lsl r3, r3, #3 // ldc = ldc * 8 | |||||
| str r3, LDC | |||||
| ldr K1, K | |||||
| ldr BC, B | |||||
| ldr J, N | |||||
| asr J, J, #1 // J = J / 2 | |||||
| cmp J, #0 // set N, Z and V for the signed test | |||||
| ble dgemm_kernel_L1_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_kernel_L2_BEGIN: | |||||
| ldr CO1, C // CO1 = C | |||||
| ldr r4 , LDC | |||||
| lsl r4 , r4 , #1 // LDC * 2 | |||||
| add r3 , r4, CO1 | |||||
| str r3 , C // store C | |||||
| ldr AO, A // AO = A | |||||
| dgemm_kernel_L2_M4_BEGIN: | |||||
| ldr I, M | |||||
| asr I, I, #2 // I = I / 4 | |||||
| cmp I, #0 | |||||
| ble dgemm_kernel_L2_M2_BEGIN | |||||
| dgemm_kernel_L2_M4_20: | |||||
| INIT4x2 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L2_M4_40 | |||||
| .align 5 | |||||
| dgemm_kernel_L2_M4_22: | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M4_22 | |||||
| dgemm_kernel_L2_M4_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L2_M4_100 | |||||
| dgemm_kernel_L2_M4_42: | |||||
| KERNEL4x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M4_42 | |||||
| dgemm_kernel_L2_M4_100: | |||||
| SAVE4x2 | |||||
| dgemm_kernel_L2_M4_END: | |||||
| subs I, I, #1 | |||||
| bgt dgemm_kernel_L2_M4_20 | |||||
| dgemm_kernel_L2_M2_BEGIN: | |||||
| ldr I, M | |||||
| tst I , #3 // any rows left after the 4-row tiles? | |||||
| beq dgemm_kernel_L2_END | |||||
| tst I, #2 // is there a 2-row tile? | |||||
| beq dgemm_kernel_L2_M1_BEGIN | |||||
| dgemm_kernel_L2_M2_20: | |||||
| INIT2x2 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L2_M2_40 | |||||
| dgemm_kernel_L2_M2_22: | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M2_22 | |||||
| dgemm_kernel_L2_M2_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L2_M2_100 | |||||
| dgemm_kernel_L2_M2_42: | |||||
| KERNEL2x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M2_42 | |||||
| dgemm_kernel_L2_M2_100: | |||||
| SAVE2x2 | |||||
| dgemm_kernel_L2_M2_END: | |||||
| dgemm_kernel_L2_M1_BEGIN: | |||||
| tst I, #1 // is there a final single row? | |||||
| beq dgemm_kernel_L2_END | |||||
| dgemm_kernel_L2_M1_20: | |||||
| INIT1x2 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L2_M1_40 | |||||
| dgemm_kernel_L2_M1_22: | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M1_22 | |||||
| dgemm_kernel_L2_M1_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L2_M1_100 | |||||
| dgemm_kernel_L2_M1_42: | |||||
| KERNEL1x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L2_M1_42 | |||||
| dgemm_kernel_L2_M1_100: | |||||
| SAVE1x2 | |||||
| dgemm_kernel_L2_END: | |||||
| mov r3, BC | |||||
| mov r4, K1 | |||||
| lsl r4, r4, #4 // k * 2 * 8 | |||||
| add r3, r3, r4 // B = B + K * 2 * 8 | |||||
| mov BC, r3 | |||||
| subs J , #1 // j-- | |||||
| bgt dgemm_kernel_L2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_kernel_L1_BEGIN: | |||||
| ldr J , N | |||||
| tst J , #1 // is there an odd last column? | |||||
| beq dgemm_kernel_L999 | |||||
| ldr CO1, C // CO1 = C | |||||
| ldr r4 , LDC | |||||
| add r3 , r4, CO1 | |||||
| str r3 , C // store C | |||||
| ldr AO, A // AO = A | |||||
| dgemm_kernel_L1_M4_BEGIN: | |||||
| ldr I, M | |||||
| asr I, I, #2 // I = I / 4 | |||||
| cmp I, #0 | |||||
| ble dgemm_kernel_L1_M2_BEGIN | |||||
| dgemm_kernel_L1_M4_20: | |||||
| INIT4x1 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L1_M4_40 | |||||
| .align 5 | |||||
| dgemm_kernel_L1_M4_22: | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M4_22 | |||||
| dgemm_kernel_L1_M4_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L1_M4_100 | |||||
| dgemm_kernel_L1_M4_42: | |||||
| KERNEL4x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M4_42 | |||||
| dgemm_kernel_L1_M4_100: | |||||
| SAVE4x1 | |||||
| dgemm_kernel_L1_M4_END: | |||||
| subs I, I, #1 | |||||
| bgt dgemm_kernel_L1_M4_20 | |||||
| dgemm_kernel_L1_M2_BEGIN: | |||||
| ldr I, M | |||||
| tst I , #3 // any rows left after the 4-row tiles? | |||||
| beq dgemm_kernel_L1_END | |||||
| tst I, #2 // is there a 2-row tile? | |||||
| beq dgemm_kernel_L1_M1_BEGIN | |||||
| dgemm_kernel_L1_M2_20: | |||||
| INIT2x1 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L1_M2_40 | |||||
| dgemm_kernel_L1_M2_22: | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M2_22 | |||||
| dgemm_kernel_L1_M2_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L1_M2_100 | |||||
| dgemm_kernel_L1_M2_42: | |||||
| KERNEL2x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M2_42 | |||||
| dgemm_kernel_L1_M2_100: | |||||
| SAVE2x1 | |||||
| dgemm_kernel_L1_M2_END: | |||||
| dgemm_kernel_L1_M1_BEGIN: | |||||
| tst I, #1 // is there a final single row? | |||||
| beq dgemm_kernel_L1_END | |||||
| dgemm_kernel_L1_M1_20: | |||||
| INIT1x1 | |||||
| mov BO, BC | |||||
| asr L , K1, #3 // L = K / 8 | |||||
| cmp L, #0 | |||||
| ble dgemm_kernel_L1_M1_40 | |||||
| dgemm_kernel_L1_M1_22: | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M1_22 | |||||
| dgemm_kernel_L1_M1_40: | |||||
| ands L , K1, #7 // L = K % 8 | |||||
| beq dgemm_kernel_L1_M1_100 | |||||
| dgemm_kernel_L1_M1_42: | |||||
| KERNEL1x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt dgemm_kernel_L1_M1_42 | |||||
| dgemm_kernel_L1_M1_100: | |||||
| SAVE1x1 | |||||
| dgemm_kernel_L1_END: | |||||
| dgemm_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,225 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/24 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* defined but not referenced by the code of this routine */ | |||||
| #define OLD_M r0 /* same register as M */ | |||||
| #define OLD_N r1 /* same register as N */ | |||||
| #define OLD_A r2 /* same register as A */ | |||||
| #define OLD_LDA r3 /* shifted left by 3 into LDA by the prologue; r3 is then reused as I */ | |||||
| #define B [fp, #4 ] /* word above the saved registers; loaded into BO by the prologue */ | |||||
| #define M r0 /* row count, halved for the 2-row loops */ | |||||
| #define N r1 /* column count, halved for the 2-column loop */ | |||||
| #define A r2 /* source pointer, advanced by LDA per column consumed */ | |||||
| #define BO r5 /* destination pointer, post-incremented by the COPY macros */ | |||||
| #define AO1 r6 /* read pointer for the first column of the group */ | |||||
| #define AO2 r7 /* read pointer for the second column (AO1 + LDA) */ | |||||
| #define LDA r8 /* lda * 8 */ | |||||
| #define I r3 /* inner loop counter */ | |||||
| #define J r12 /* outer loop counter */ | |||||
| #define A_PRE 256 /* defined but not referenced by the code of this routine */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY2x2 /* copy two doubles from AO1 and two from AO2 to BO, interleaved */ | |||||
| fldd d0 , [ AO1, #0 ] /* d0, d2 = the two doubles at AO1 */ | |||||
| fldd d2 , [ AO1, #8 ] | |||||
| fldd d1 , [ AO2, #0 ] /* d1, d3 = the two doubles at AO2 */ | |||||
| fldd d3 , [ AO2, #8 ] | |||||
| add AO1, AO1, #16 | |||||
| fstmiad BO!, { d0 - d3 } /* store d0, d1, d2, d3 and advance BO by 32 bytes */ | |||||
| add AO2, AO2, #16 | |||||
| .endm | |||||
| .macro COPY1x2 /* copy one double from AO1 and one from AO2 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fldd d1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #8 | |||||
| fstmiad BO!, { d0 - d1 } /* store d0, d1 and advance BO by 16 bytes */ | |||||
| add AO2, AO2, #8 | |||||
| .endm | |||||
| .macro COPY2x1 /* copy two consecutive doubles from AO1 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fstmiad BO!, { d0 - d1 } /* store d0, d1 and advance BO by 16 bytes */ | |||||
| add AO1, AO1, #16 | |||||
| .endm | |||||
| .macro COPY1x1 /* copy one double from AO1 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fstmiad BO!, { d0 } /* store d0 and advance BO by 8 bytes */ | |||||
| add AO1, AO1, #8 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /* | |||||
| * Copies an M x N block of doubles from A (column stride lda) into the | |||||
| * buffer B, two columns at a time with their elements interleaved | |||||
| * (COPY2x2 / COPY1x2), followed by one odd column copied as is | |||||
| * (COPY2x1 / COPY1x1). | |||||
| * | |||||
| * Inputs: r0 = M, r1 = N, r2 = A, r3 = lda; B is read from [fp, #4]. | |||||
| * Returns 0 in r0. | |||||
| * | |||||
| * Flag handling: tst, ands and asrs do not write the V flag, while ble tests | |||||
| * "Z set or N != V". Remainder tests therefore branch with beq, and count | |||||
| * tests use asr followed by an explicit cmp against #0, so no branch depends | |||||
| * on a V flag left over from the caller. | |||||
| */ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| lsl LDA, OLD_LDA, #3 // lda = lda * 8 | |||||
| ldr BO, B | |||||
| /*********************************************************************************************/ | |||||
| dgemm_ncopy_L2_BEGIN: | |||||
| asr J, N, #1 // J = N / 2 | |||||
| cmp J, #0 // set N, Z and V for the signed test | |||||
| ble dgemm_ncopy_L1_BEGIN | |||||
| dgemm_ncopy_L2_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| add AO2, AO1, LDA | |||||
| add A , AO2, LDA // A = A + 2 * LDA | |||||
| asr I, M, #1 // I = M / 2 | |||||
| cmp I, #0 | |||||
| ble dgemm_ncopy_L2_M2_40 | |||||
| dgemm_ncopy_L2_M2_20: | |||||
| COPY2x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L2_M2_20 | |||||
| dgemm_ncopy_L2_M2_40: | |||||
| ands I, M , #1 // I = M % 2 | |||||
| beq dgemm_ncopy_L2_M2_END | |||||
| dgemm_ncopy_L2_M2_60: | |||||
| COPY1x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L2_M2_60 | |||||
| dgemm_ncopy_L2_M2_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne dgemm_ncopy_L2_M2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_ncopy_L1_BEGIN: | |||||
| tst N, #1 // is there an odd last column? | |||||
| beq dgemm_ncopy_L999 | |||||
| dgemm_ncopy_L1_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| add A , AO1, LDA // A = A + 1 * LDA | |||||
| asr I, M, #1 // I = M / 2 | |||||
| cmp I, #0 | |||||
| ble dgemm_ncopy_L1_M2_40 | |||||
| dgemm_ncopy_L1_M2_20: | |||||
| COPY2x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L1_M2_20 | |||||
| dgemm_ncopy_L1_M2_40: | |||||
| ands I, M , #1 // I = M % 2 | |||||
| beq dgemm_ncopy_L1_M2_END | |||||
| dgemm_ncopy_L1_M2_60: | |||||
| COPY1x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L1_M2_60 | |||||
| dgemm_ncopy_L1_M2_END: | |||||
| dgemm_ncopy_L999: | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,349 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/05 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes subtracted from sp by the prologue for the local frame */ | |||||
| #define OLD_M r0 /* same register as M */ | |||||
| #define OLD_N r1 /* same register as N */ | |||||
| #define OLD_A r2 /* same register as A */ | |||||
| #define OLD_LDA r3 /* the prologue shifts r3 left by 3 and stores it to the LDA slot */ | |||||
| /****************************************************** | |||||
| * The 64 bytes starting at [fp, #-128] are reserved | |||||
| * for store and restore of floating point | |||||
| * registers d8 - d15 (prologue vstm / epilogue vldm) | |||||
| *******************************************************/ | |||||
| #define LDA [fp, #-260 ] /* frame slot holding lda * 8 */ | |||||
| #define B [fp, #4 ] /* word above the saved registers; loaded into BO by the prologue */ | |||||
| #define M r0 /* row count, divided by 4 for the 4-row loops */ | |||||
| #define N r1 /* column count, divided by 4 for the 4-column loop */ | |||||
| #define A r2 /* source pointer, advanced by LDA per column consumed */ | |||||
| #define BO r5 /* destination pointer, post-incremented by the COPY macros */ | |||||
| #define AO1 r6 /* read pointer for the first column of the group */ | |||||
| #define AO2 r7 /* read pointer for the second column */ | |||||
| #define AO3 r8 /* read pointer for the third column */ | |||||
| #define AO4 r9 /* read pointer for the fourth column */ | |||||
| #define I r3 /* inner loop counter */ | |||||
| #define J r12 /* outer loop counter */ | |||||
| #define A_PRE 256 /* byte offset used by pld on AO1 - AO4 */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY4x4 /* copy four doubles from each of AO1 - AO4 to BO, interleaved; uses d0 - d15 */ | |||||
| pld [ AO1, #A_PRE ] | |||||
| pld [ AO2, #A_PRE ] | |||||
| pld [ AO3, #A_PRE ] | |||||
| pld [ AO4, #A_PRE ] | |||||
| fldd d0 , [ AO1, #0 ] /* d0 - d3 = first double of AO1, AO2, AO3, AO4 */ | |||||
| fldd d1 , [ AO2, #0 ] | |||||
| fldd d2 , [ AO3, #0 ] | |||||
| fldd d3 , [ AO4, #0 ] | |||||
| fldd d4 , [ AO1, #8 ] /* d4, d8, d12 = second, third, fourth double of AO1 */ | |||||
| fldd d8 , [ AO1, #16 ] | |||||
| fldd d12, [ AO1, #24 ] | |||||
| fldd d5 , [ AO2, #8 ] /* d5, d9, d13 = the same for AO2 */ | |||||
| add AO1, AO1, #32 | |||||
| fldd d9 , [ AO2, #16 ] | |||||
| fldd d13, [ AO2, #24 ] | |||||
| fldd d6 , [ AO3, #8 ] /* d6, d10, d14 = the same for AO3 */ | |||||
| add AO2, AO2, #32 | |||||
| fldd d10, [ AO3, #16 ] | |||||
| fldd d14, [ AO3, #24 ] | |||||
| fldd d7 , [ AO4, #8 ] /* d7, d11, d15 = the same for AO4 */ | |||||
| add AO3, AO3, #32 | |||||
| fldd d11, [ AO4, #16 ] | |||||
| fldd d15, [ AO4, #24 ] | |||||
| fstmiad BO!, { d0 - d3 } /* three stores write d0 .. d15 in order; BO advances by 128 bytes in total */ | |||||
| add AO4, AO4, #32 | |||||
| fstmiad BO!, { d4 - d7 } | |||||
| fstmiad BO!, { d8 - d15 } | |||||
| .endm | |||||
| .macro COPY1x4 /* copy one double from each of AO1 - AO4 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fldd d1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #8 | |||||
| fldd d2 , [ AO3, #0 ] | |||||
| add AO2, AO2, #8 | |||||
| fldd d3 , [ AO4, #0 ] | |||||
| add AO3, AO3, #8 | |||||
| fstmiad BO!, { d0 - d3 } /* store d0 .. d3 and advance BO by 32 bytes */ | |||||
| add AO4, AO4, #8 | |||||
| .endm | |||||
| .macro COPY4x2 /* copy four doubles from AO1 and four from AO2 to BO, interleaved */ | |||||
| fldd d0 , [ AO1, #0 ] /* even registers d0, d2, d4, d6 = the four doubles at AO1 */ | |||||
| fldd d2 , [ AO1, #8 ] | |||||
| fldd d4 , [ AO1, #16 ] | |||||
| fldd d6 , [ AO1, #24 ] | |||||
| fldd d1 , [ AO2, #0 ] /* odd registers d1, d3, d5, d7 = the four doubles at AO2 */ | |||||
| fldd d3 , [ AO2, #8 ] | |||||
| add AO1, AO1, #32 | |||||
| fldd d5 , [ AO2, #16 ] | |||||
| fldd d7 , [ AO2, #24 ] | |||||
| fstmiad BO!, { d0 - d7 } /* store d0 .. d7 and advance BO by 64 bytes */ | |||||
| add AO2, AO2, #32 | |||||
| .endm | |||||
| .macro COPY1x2 /* copy one double from AO1 and one from AO2 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fldd d1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #8 | |||||
| fstmiad BO!, { d0 - d1 } /* store d0, d1 and advance BO by 16 bytes */ | |||||
| add AO2, AO2, #8 | |||||
| .endm | |||||
| .macro COPY4x1 /* copy four consecutive doubles from AO1 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fldd d2 , [ AO1, #16 ] | |||||
| fldd d3 , [ AO1, #24 ] | |||||
| fstmiad BO!, { d0 - d3 } /* store d0 .. d3 and advance BO by 32 bytes */ | |||||
| add AO1, AO1, #32 | |||||
| .endm | |||||
| .macro COPY1x1 /* copy one double from AO1 to BO */ | |||||
| fldd d0 , [ AO1, #0 ] | |||||
| fstmiad BO!, { d0 } /* store d0 and advance BO by 8 bytes */ | |||||
| add AO1, AO1, #8 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| /* | |||||
| * Copies an M x N block of doubles from A (column stride lda) into the | |||||
| * buffer B: four columns at a time with their elements interleaved | |||||
| * (COPY4x4 / COPY1x4), then a two-column group (COPY4x2 / COPY1x2) and a | |||||
| * final single column (COPY4x1 / COPY1x1) when N is not a multiple of 4. | |||||
| * d8 - d15 are saved and restored because COPY4x4 overwrites them. | |||||
| * | |||||
| * Inputs: r0 = M, r1 = N, r2 = A, r3 = lda; B is read from [fp, #4]. | |||||
| * Returns 0 in r0. | |||||
| * | |||||
| * Flag handling: tst, ands and asrs do not write the V flag, while ble tests | |||||
| * "Z set or N != V". Remainder tests therefore branch with beq, and count | |||||
| * tests use asr followed by an explicit cmp against #0, so no branch depends | |||||
| * on a V flag left over from the caller. | |||||
| */ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| lsl r3, r3, #3 // lda = lda * 8 | |||||
| str r3, LDA | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| ldr BO, B | |||||
| dgemm_ncopy_L4_BEGIN: | |||||
| asr J, N, #2 // J = N / 4 | |||||
| cmp J, #0 // set N, Z and V for the signed test | |||||
| ble dgemm_ncopy_L2_BEGIN | |||||
| dgemm_ncopy_L4_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add AO2, AO1, r4 | |||||
| add AO3, AO2, r4 | |||||
| add AO4, AO3, r4 | |||||
| add A , AO4, r4 // A = A + 4 * LDA | |||||
| asr I, M, #2 // I = M / 4 | |||||
| cmp I, #0 | |||||
| ble dgemm_ncopy_L4_M4_40 | |||||
| dgemm_ncopy_L4_M4_20: | |||||
| COPY4x4 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L4_M4_20 | |||||
| dgemm_ncopy_L4_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| beq dgemm_ncopy_L4_M4_END | |||||
| dgemm_ncopy_L4_M4_60: | |||||
| COPY1x4 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L4_M4_60 | |||||
| dgemm_ncopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne dgemm_ncopy_L4_M4_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_ncopy_L2_BEGIN: | |||||
| tst N, #3 // any columns left after the 4-column groups? | |||||
| beq dgemm_ncopy_L999 | |||||
| tst N, #2 // is there a 2-column group? | |||||
| beq dgemm_ncopy_L1_BEGIN | |||||
| dgemm_ncopy_L2_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add AO2, AO1, r4 | |||||
| add A , AO2, r4 // A = A + 2 * LDA | |||||
| asr I, M, #2 // I = M / 4 | |||||
| cmp I, #0 | |||||
| ble dgemm_ncopy_L2_M4_40 | |||||
| dgemm_ncopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L2_M4_20 | |||||
| dgemm_ncopy_L2_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| beq dgemm_ncopy_L2_M4_END | |||||
| dgemm_ncopy_L2_M4_60: | |||||
| COPY1x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L2_M4_60 | |||||
| dgemm_ncopy_L2_M4_END: | |||||
| /*********************************************************************************************/ | |||||
| dgemm_ncopy_L1_BEGIN: | |||||
| tst N, #1 // is there an odd last column? | |||||
| beq dgemm_ncopy_L999 | |||||
| dgemm_ncopy_L1_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add A , AO1, r4 // A = A + 1 * LDA | |||||
| asr I, M, #2 // I = M / 4 | |||||
| cmp I, #0 | |||||
| ble dgemm_ncopy_L1_M4_40 | |||||
| dgemm_ncopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L1_M4_20 | |||||
| dgemm_ncopy_L1_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| beq dgemm_ncopy_L1_M4_END | |||||
| dgemm_ncopy_L1_M4_60: | |||||
| COPY1x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_ncopy_L1_M4_60 | |||||
| dgemm_ncopy_L1_M4_END: | |||||
| dgemm_ncopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,408 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/06 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp for local slots */ | |||||
| #define OLD_M r0 /* r0-r3 under the names they carry on entry */ | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 /* spilled to the A slot by the prologue */ | |||||
| #define OLD_LDA r3 /* shifted left by 3 into LDA by the prologue */ | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] /* memory slot holding the current destination pointer */ | |||||
| #define A [fp, #-248 ] /* memory slot holding the current source pointer */ | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define M4 r2 /* M << 5; same register as OLD_A, so A is spilled before M4 is written */ | |||||
| #define LDA r5 /* lda << 3 */ | |||||
| #define AO1 r6 /* source read pointer */ | |||||
| #define BO1 r7 /* write pointer used by the COPY4xN macros */ | |||||
| #define BO2 r8 /* write pointer used by the COPY2xN macros */ | |||||
| #define BO3 r9 /* write pointer used by the COPY1xN macros */ | |||||
| #define I r4 /* inner loop counter */ | |||||
| #define J r12 /* outer loop counter */ | |||||
| #define A_PRE 256 /* byte offset used by the pld prefetches */ | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY4x4 /* load 4 doubles from each of AO1, AO1+LDA, AO1+2*LDA, AO1+3*LDA and store all 16 at BO1; overwrites r3 and d0-d15 */ | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } /* 4 doubles at AO1 */ | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d4 - d7 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 2 * LDA */ | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d8 - d11 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 3 * LDA */ | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d12 - d15 } | |||||
| fstmiad BO1, { d0 - d15 } /* 16 doubles written starting at BO1 */ | |||||
| add AO1, AO1, #32 /* source advances by 4 doubles */ | |||||
| add BO1, BO1, M4 /* destination advances by M4 bytes */ | |||||
| .endm | |||||
| .macro COPY2x4 /* load 2 doubles from each of 4 LDA-spaced addresses starting at AO1 and store all 8 at BO2; overwrites r3 and d0-d7 */ | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| fldmiad r3, { d2 - d3 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 2 * LDA */ | |||||
| fldmiad r3, { d4 - d5 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 3 * LDA */ | |||||
| fldmiad r3, { d6 - d7 } | |||||
| fstmiad BO2, { d0 - d7 } | |||||
| add AO1, AO1, #16 /* source advances by 2 doubles */ | |||||
| add BO2, BO2, #64 /* destination advances by the 8 doubles just written */ | |||||
| .endm | |||||
| .macro COPY1x4 /* load 1 double from each of 4 LDA-spaced addresses starting at AO1 and store all 4 at BO3; overwrites r3 and d0-d3 */ | |||||
| fldmiad AO1, { d0 } | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| fldmiad r3, { d1 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 2 * LDA */ | |||||
| fldmiad r3, { d2 } | |||||
| add r3, r3, LDA /* r3 = AO1 + 3 * LDA */ | |||||
| fldmiad r3, { d3 } | |||||
| fstmiad BO3, { d0 - d3 } | |||||
| add AO1, AO1, #8 /* source advances by 1 double */ | |||||
| add BO3, BO3, #32 /* destination advances by the 4 doubles just written */ | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x2 /* load 4 doubles from AO1 and 4 from AO1+LDA and store all 8 at BO1; overwrites r3 and d0-d7 */ | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d4 - d7 } | |||||
| fstmiad BO1, { d0 - d7 } | |||||
| add AO1, AO1, #32 /* source advances by 4 doubles */ | |||||
| add BO1, BO1, M4 /* destination advances by M4 bytes */ | |||||
| .endm | |||||
| .macro COPY2x2 /* load 2 doubles from AO1 and 2 from AO1+LDA and store all 4 at BO2; overwrites r3 and d0-d3 */ | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| fldmiad r3, { d2 - d3 } | |||||
| fstmiad BO2, { d0 - d3 } | |||||
| add AO1, AO1, #16 /* source advances by 2 doubles */ | |||||
| add BO2, BO2, #32 /* destination advances by the 4 doubles just written */ | |||||
| .endm | |||||
| .macro COPY1x2 /* load 1 double from AO1 and 1 from AO1+LDA and store both at BO3; overwrites r3 and d0-d1 */ | |||||
| fldmiad AO1, { d0 } | |||||
| add r3, AO1, LDA /* r3 = AO1 + LDA */ | |||||
| fldmiad r3, { d1 } | |||||
| fstmiad BO3, { d0 - d1 } | |||||
| add AO1, AO1, #8 /* source advances by 1 double */ | |||||
| add BO3, BO3, #16 /* destination advances by the 2 doubles just written */ | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY4x1 /* copy 4 consecutive doubles from AO1 to BO1; overwrites d0-d3 */ | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| fstmiad BO1, { d0 - d3 } | |||||
| add AO1, AO1, #32 /* source advances by 4 doubles */ | |||||
| add BO1, BO1, M4 /* destination advances by M4 bytes */ | |||||
| .endm | |||||
| .macro COPY2x1 /* copy 2 consecutive doubles from AO1 to BO2; overwrites d0-d1 */ | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| fstmiad BO2, { d0 - d1 } | |||||
| add AO1, AO1, #16 /* source advances by 2 doubles */ | |||||
| add BO2, BO2, #16 /* destination advances by 2 doubles */ | |||||
| .endm | |||||
| .macro COPY1x1 /* copy 1 double from AO1 to BO3; overwrites d0 */ | |||||
| fldmiad AO1, { d0 } | |||||
| fstmiad BO3, { d0 } | |||||
| add AO1, AO1, #8 /* source advances by 1 double */ | |||||
| add BO3, BO3, #8 /* destination advances by 1 double */ | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* Entry: r0 = M, r1 = N, r2 = A, r3 = lda; the destination pointer is read from the B slot at [fp, #4]. Copies the source in 4-, 2- and 1-wide groups selected by M, each split into 4/2/1 chunks selected by N, and returns 0 in r0. */ | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 /* fp = sp + 24, the highest word just pushed */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #3 // lda = lda * SIZE | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| lsl r4 , M, #3 // M * SIZE | |||||
| ldr r3, B /* r3 = destination base */ | |||||
| and BO2 , N , #-4 /* N with its two low bits cleared */ | |||||
| and BO3 , N , #-2 /* N with its low bit cleared */ | |||||
| mul BO2, BO2, r4 | |||||
| mul BO3, BO3, r4 | |||||
| add BO2 , BO2, r3 /* BO2 = B + (N & -4) * M * 8 */ | |||||
| add BO3 , BO3, r3 /* BO3 = B + (N & -2) * M * 8 */ | |||||
| lsl M4, M, #5 // M4 = M * 4 * SIZE | |||||
| dgemm_tcopy_L4_BEGIN: | |||||
| asrs J, M, #2 // J = M / 4 | |||||
| ble dgemm_tcopy_L2_BEGIN | |||||
| dgemm_tcopy_L4_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #2 // r3 = 4 * LDA | |||||
| add r3, r3 , AO1 // A = A + 4 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #128 // B = B + 16 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble dgemm_tcopy_L4_M4_40 | |||||
| dgemm_tcopy_L4_M4_20: | |||||
| COPY4x4 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L4_M4_20 | |||||
| dgemm_tcopy_L4_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L4_M4_60 /* skip when bit 1 of N is clear. NOTE(review): ble also reads N and V, so beq may state the intent more directly - confirm */ | |||||
| COPY2x4 | |||||
| dgemm_tcopy_L4_M4_60: | |||||
| tst N, #1 | |||||
| ble dgemm_tcopy_L4_M4_END /* skip when bit 0 of N is clear */ | |||||
| COPY1x4 | |||||
| dgemm_tcopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne dgemm_tcopy_L4_M4_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| dgemm_tcopy_L2_BEGIN: | |||||
| tst M, #3 | |||||
| ble dgemm_tcopy_L999 /* done when the two low bits of M are both clear */ | |||||
| tst M, #2 | |||||
| ble dgemm_tcopy_L1_BEGIN /* no 2-wide group when bit 1 of M is clear */ | |||||
| dgemm_tcopy_L2_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #64 // B = B + 8 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble dgemm_tcopy_L2_M4_40 | |||||
| dgemm_tcopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L2_M4_20 | |||||
| dgemm_tcopy_L2_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L2_M4_60 | |||||
| COPY2x2 | |||||
| dgemm_tcopy_L2_M4_60: | |||||
| tst N , #1 | |||||
| ble dgemm_tcopy_L2_M4_END | |||||
| COPY1x2 | |||||
| dgemm_tcopy_L2_M4_END: | |||||
| /*********************************************************************************************/ | |||||
| dgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 | |||||
| ble dgemm_tcopy_L999 /* no 1-wide group when bit 0 of M is clear */ | |||||
| dgemm_tcopy_L1_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 4 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble dgemm_tcopy_L1_M4_40 | |||||
| dgemm_tcopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne dgemm_tcopy_L1_M4_20 | |||||
| dgemm_tcopy_L1_M4_40: | |||||
| tst N , #2 | |||||
| ble dgemm_tcopy_L1_M4_60 | |||||
| COPY2x1 | |||||
| dgemm_tcopy_L1_M4_60: | |||||
| tst N , #1 | |||||
| ble dgemm_tcopy_L1_M4_END | |||||
| COPY1x1 | |||||
| dgemm_tcopy_L1_M4_END: | |||||
| dgemm_tcopy_L999: | |||||
| sub r3, fp, #128 /* same address the prologue saved d8-d15 to */ | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 /* back to the sp value right after the push */ | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,64 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Strided dot product: returns the sum over i in [0, n) of | |||||
|  * x[i * inc_x] * y[i * inc_y], accumulated in a double. | |||||
|  * Returns 0.0 when n <= 0. | |||||
|  * | |||||
|  * When DSDOT is defined the return type is double and both operands are | |||||
|  * widened to double before the multiply, so the product is formed in | |||||
|  * double precision instead of being rounded to FLOAT precision first. | |||||
|  */ | |||||
| #if defined(DSDOT) | |||||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #else | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #endif | |||||
| { | |||||
|     BLASLONG i=0; | |||||
|     BLASLONG ix=0,iy=0; | |||||
|     double dot = 0.0 ; | |||||
|     if ( n < 0 ) return(dot); | |||||
|     while(i < n) | |||||
|     { | |||||
| #if defined(DSDOT) | |||||
|         /* widen first: the whole point of dsdot is a double-precision product */ | |||||
|         dot += (double) y[iy] * (double) x[ix] ; | |||||
| #else | |||||
|         dot += y[iy] * x[ix] ; | |||||
| #endif | |||||
|         ix += inc_x ; | |||||
|         iy += inc_y ; | |||||
|         i++ ; | |||||
|     } | |||||
|     return(dot); | |||||
| } | |||||
| @@ -0,0 +1,67 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Reference gemv kernel, non-transposed form: for every j in [0, n) and | |||||
|  * i in [0, m), y[i * inc_y] += alpha * x[j * inc_x] * a[j * lda + i]. | |||||
|  * | |||||
|  * dummy1 and buffer are not used. Always returns 0; the function is | |||||
|  * declared int, so falling off the end without a value would be undefined | |||||
|  * behaviour for any caller that reads the result. | |||||
|  */ | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
|     BLASLONG i; | |||||
|     BLASLONG ix,iy; | |||||
|     BLASLONG j; | |||||
|     FLOAT *a_ptr; | |||||
|     FLOAT temp; | |||||
|     ix = 0; | |||||
|     a_ptr = a; | |||||
|     for (j=0; j<n; j++) | |||||
|     { | |||||
|         /* alpha * x[j] is constant for the whole inner pass over a_ptr */ | |||||
|         temp = alpha * x[ix]; | |||||
|         iy = 0; | |||||
|         for (i=0; i<m; i++) | |||||
|         { | |||||
|             y[iy] += temp * a_ptr[i]; | |||||
|             iy += inc_y; | |||||
|         } | |||||
|         a_ptr += lda; | |||||
|         ix += inc_x; | |||||
|     } | |||||
|     return(0); | |||||
| } | |||||
| @@ -0,0 +1,740 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/28 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes the prologue subtracts from sp */ | |||||
| #define OLD_LDA [fp, #0 ] /* lda as found in memory at fp; shifted into LDA by the prologue */ | |||||
| #define X [fp, #4 ] /* x pointer, reloaded into XO for every block of rows */ | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] /* y pointer, loaded into YO once per path */ | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #define OLD_A r3 /* A on entry; spilled to the A slot, after which r3 is used as scratch */ | |||||
| #define OLD_M r0 /* M on entry; spilled to the M slot because r0 is reused as AO1 */ | |||||
| #define AO1 r0 /* read pointer into A */ | |||||
| #define N r1 | |||||
| #define J r2 /* inner loop counter */ | |||||
| #define AO2 r4 /* second pointer into A; the macros only use it as a pld address */ | |||||
| #define XO r5 | |||||
| #define YO r6 | |||||
| #define LDA r7 /* lda shifted left by 3 (DOUBLE) or 2 */ | |||||
| #define INC_X r8 | |||||
| #define INC_Y r9 | |||||
| #define I r12 /* outer loop counter */ | |||||
| #define M [fp, #-252 ] | |||||
| #define A [fp, #-256 ] /* running A pointer, advanced once per block of rows */ | |||||
| #define X_PRE 64 /* byte offsets added to the pld addresses */ | |||||
| #define Y_PRE 0 | |||||
| #define A_PRE 0 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(DOUBLE) | |||||
| .macro INIT_F8 /* prefetch y and set the eight accumulators d8-d15 to +0.0; overwrites r3 */ | |||||
| pld [ YO , #Y_PRE ] | |||||
| pld [ YO , #Y_PRE+32 ] | |||||
| mov r3 , #0 /* r3 is dead at every call site; flags are left untouched */ | |||||
| vmov d8 , r3 , r3 /* all-zero bits = +0.0; unlike d8 - d8 this cannot yield NaN when d8 holds NaN or Inf */ | |||||
| vmov.f64 d9 , d8 | |||||
| vmov.f64 d10 , d8 | |||||
| vmov.f64 d11 , d8 | |||||
| vmov.f64 d12 , d8 | |||||
| vmov.f64 d13 , d8 | |||||
| vmov.f64 d14 , d8 | |||||
| vmov.f64 d15 , d8 | |||||
| .endm | |||||
| .macro KERNEL_F8X8 /* eight KERNEL_F8X1 steps, with a pld on XO before each group of four */ | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| .endm | |||||
| .macro KERNEL_F8X1 /* d8-d15 += (double at XO) * (8 doubles at AO1); XO steps one double, AO1 and AO2 step by LDA; overwrites r3, d2, d4-d7 */ | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad XO! , { d2 } /* d2 = *XO, XO written back past it */ | |||||
| fldmiad AO1 , { d4 - d7 } /* first four doubles at AO1 */ | |||||
| vmla.f64 d8 , d2 , d4 | |||||
| pld [ AO2 , #4*SIZE ] | |||||
| vmla.f64 d9 , d2 , d5 | |||||
| add r3, AO1, #4*SIZE /* r3 = address of the next four doubles */ | |||||
| vmla.f64 d10 , d2 , d6 | |||||
| vmla.f64 d11 , d2 , d7 | |||||
| fldmiad r3 , { d4 - d7 } /* d4-d7 reused for the second four doubles */ | |||||
| vmla.f64 d12 , d2 , d4 | |||||
| vmla.f64 d13 , d2 , d5 | |||||
| add AO1, AO1, LDA | |||||
| vmla.f64 d14 , d2 , d6 | |||||
| add AO2, AO2, LDA | |||||
| vmla.f64 d15 , d2 , d7 | |||||
| .endm | |||||
| .macro SAVE_F8 /* add d0 * d8..d15 to the eight doubles at YO and advance YO past them; d0 is never written in this file - presumably alpha on entry, confirm against the caller */ | |||||
| fldmiad YO, { d4 - d7 } /* current values of the first four y elements */ | |||||
| vmla.f64 d4 , d0, d8 | |||||
| vmla.f64 d5 , d0, d9 | |||||
| vmla.f64 d6 , d0, d10 | |||||
| vmla.f64 d7 , d0, d11 | |||||
| fstmiad YO!, { d4 - d7 } /* store and step YO by four doubles */ | |||||
| fldmiad YO, { d4 - d7 } /* next four y elements */ | |||||
| vmla.f64 d4 , d0, d12 | |||||
| vmla.f64 d5 , d0, d13 | |||||
| vmla.f64 d6 , d0, d14 | |||||
| vmla.f64 d7 , d0, d15 | |||||
| fstmiad YO!, { d4 - d7 } | |||||
| .endm | |||||
| .macro INIT_F1 /* set the single accumulator d12 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov d12 , r3 , r3 /* all-zero bits = +0.0; d12 - d12 would be NaN if d12 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* d12 += (double at XO) * (double at AO1); XO steps one double, AO1 steps by LDA; overwrites d2 and d8 */ | |||||
| fldmiad XO! , { d2 } | |||||
| fldmiad AO1 , { d8 } | |||||
| vmla.f64 d12 , d2 , d8 | |||||
| add AO1, AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_F1 /* add d0 * d12 to the double at YO and advance YO past it; overwrites d4 */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4, d0, d12 | |||||
| fstmiad YO!, { d4 } | |||||
| .endm | |||||
| /*********************************************************************************************/ | |||||
| .macro INIT_S4 /* set the four accumulators d12-d15 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov d12 , r3 , r3 /* all-zero bits = +0.0; d12 - d12 would be NaN if d12 held NaN or Inf */ | |||||
| vmov.f64 d13 , d12 | |||||
| vmov.f64 d14 , d12 | |||||
| vmov.f64 d15 , d12 | |||||
| .endm | |||||
| .macro KERNEL_S4X4 /* four consecutive KERNEL_S4X1 steps */ | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| .endm | |||||
| .macro KERNEL_S4X1 /* d12-d15 += (double at XO) * (4 doubles at AO1); AO1 and AO2 step by LDA, XO steps by INC_X; overwrites d2 and d8-d11 */ | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad XO , { d2 } /* no writeback: XO is stepped by INC_X at the end */ | |||||
| fldmiad AO1 , { d8 - d11 } | |||||
| vmla.f64 d12 , d2 , d8 | |||||
| add AO1, AO1, LDA | |||||
| vmla.f64 d13 , d2 , d9 | |||||
| add AO2, AO2, LDA | |||||
| vmla.f64 d14 , d2 , d10 | |||||
| vmla.f64 d15 , d2 , d11 | |||||
| add XO, XO , INC_X | |||||
| .endm | |||||
| .macro SAVE_S4 /* for each of d12..d15 in turn: add d0 * accumulator to the double at YO, then step YO by INC_Y; overwrites d4 and d5 */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4 , d0, d12 | |||||
| fstmiad YO, { d4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d5 } | |||||
| vmla.f64 d5 , d0, d13 | |||||
| fstmiad YO, { d5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4 , d0, d14 | |||||
| fstmiad YO, { d4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d5 } | |||||
| vmla.f64 d5 , d0, d15 | |||||
| fstmiad YO, { d5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* set the single accumulator d12 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov d12 , r3 , r3 /* all-zero bits = +0.0; d12 - d12 would be NaN if d12 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* d12 += (double at XO) * (double at AO1); AO1 steps by LDA, XO steps by INC_X; overwrites d2 and d8 */ | |||||
| fldmiad XO , { d2 } | |||||
| fldmiad AO1 , { d8 } | |||||
| vmla.f64 d12 , d2 , d8 | |||||
| add AO1, AO1, LDA | |||||
| add XO, XO , INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* add d0 * d12 to the double at YO, then step YO by INC_Y; overwrites d4 */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4, d0, d12 | |||||
| fstmiad YO , { d4 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #else /************************* SINGLE PRECISION *****************************************/ | |||||
| .macro INIT_F8 /* prefetch y and set the eight accumulators s8-s15 to +0.0; overwrites r3 */ | |||||
| pld [ YO , #Y_PRE ] | |||||
| mov r3 , #0 /* r3 is dead at every call site; flags are left untouched */ | |||||
| vmov s8 , r3 /* all-zero bits = +0.0; unlike s8 - s8 this cannot yield NaN when s8 holds NaN or Inf */ | |||||
| vmov.f32 s9 , s8 | |||||
| vmov.f32 s10 , s8 | |||||
| vmov.f32 s11 , s8 | |||||
| vmov.f32 s12 , s8 | |||||
| vmov.f32 s13 , s8 | |||||
| vmov.f32 s14 , s8 | |||||
| vmov.f32 s15 , s8 | |||||
| .endm | |||||
| .macro KERNEL_F8X8 /* one pld on XO followed by eight KERNEL_F8X1 steps */ | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| .endm | |||||
| .macro KERNEL_F8X1 /* s8-s15 += (float at XO) * (8 floats at AO1); XO steps one float, AO1 and AO2 step by LDA; overwrites r3, s2, s4-s7 */ | |||||
| pld [ AO2, #A_PRE ] | |||||
| fldmias XO! , { s2 } /* s2 = *XO, XO written back past it */ | |||||
| fldmias AO1 , { s4 - s7 } /* first four floats at AO1 */ | |||||
| vmla.f32 s8 , s2 , s4 | |||||
| vmla.f32 s9 , s2 , s5 | |||||
| vmla.f32 s10 , s2 , s6 | |||||
| vmla.f32 s11 , s2 , s7 | |||||
| add r3, AO1, #4*SIZE /* r3 = address of the next four floats */ | |||||
| fldmias r3 , { s4 - s7 } /* s4-s7 reused for the second four floats */ | |||||
| vmla.f32 s12 , s2 , s4 | |||||
| vmla.f32 s13 , s2 , s5 | |||||
| vmla.f32 s14 , s2 , s6 | |||||
| vmla.f32 s15 , s2 , s7 | |||||
| add AO1, AO1, LDA | |||||
| add AO2, AO2, LDA | |||||
| .endm | |||||
| .macro SAVE_F8 /* add s0 * s8..s15 to the eight floats at YO and advance YO past them; s0 is never written in this file - presumably alpha on entry, confirm against the caller */ | |||||
| fldmias YO, { s4 - s7 } /* current values of the first four y elements */ | |||||
| vmla.f32 s4 , s0, s8 | |||||
| vmla.f32 s5 , s0, s9 | |||||
| vmla.f32 s6 , s0, s10 | |||||
| vmla.f32 s7 , s0, s11 | |||||
| fstmias YO!, { s4 - s7 } /* store and step YO by four floats */ | |||||
| fldmias YO, { s4 - s7 } /* next four y elements */ | |||||
| vmla.f32 s4 , s0, s12 | |||||
| vmla.f32 s5 , s0, s13 | |||||
| vmla.f32 s6 , s0, s14 | |||||
| vmla.f32 s7 , s0, s15 | |||||
| fstmias YO!, { s4 - s7 } | |||||
| .endm | |||||
| .macro INIT_F1 /* set the single accumulator s12 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov s12 , r3 /* all-zero bits = +0.0; s12 - s12 would be NaN if s12 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* s12 += (float at XO) * (float at AO1); XO steps one float, AO1 steps by LDA; overwrites s2 and s8 */ | |||||
| fldmias XO! , { s2 } | |||||
| fldmias AO1 , { s8 } | |||||
| vmla.f32 s12 , s2 , s8 | |||||
| add AO1, AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_F1 /* add s0 * s12 to the float at YO and advance YO past it; overwrites s4 */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4, s0, s12 | |||||
| fstmias YO!, { s4 } | |||||
| .endm | |||||
| /*********************************************************************************************/ | |||||
| .macro INIT_S4 /* set the four accumulators s12-s15 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov s12 , r3 /* all-zero bits = +0.0; s12 - s12 would be NaN if s12 held NaN or Inf */ | |||||
| vmov.f32 s13 , s12 | |||||
| vmov.f32 s14 , s12 | |||||
| vmov.f32 s15 , s12 | |||||
| .endm | |||||
| .macro KERNEL_S4X4 /* four KERNEL_S4X1 steps, with a pld on AO2 before each pair */ | |||||
| pld [ AO2 , #A_PRE ] | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| pld [ AO2 , #A_PRE ] | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| .endm | |||||
| .macro KERNEL_S4X1 /* s12-s15 += (float at XO) * (4 floats at AO1); AO1 and AO2 step by LDA, XO steps by INC_X; overwrites s2 and s8-s11 */ | |||||
| fldmias XO , { s2 } /* no writeback: XO is stepped by INC_X at the end */ | |||||
| fldmias AO1 , { s8 - s11 } | |||||
| vmla.f32 s12 , s2 , s8 | |||||
| vmla.f32 s13 , s2 , s9 | |||||
| vmla.f32 s14 , s2 , s10 | |||||
| vmla.f32 s15 , s2 , s11 | |||||
| add AO1, AO1, LDA | |||||
| add AO2, AO2, LDA | |||||
| add XO, XO , INC_X | |||||
| .endm | |||||
| .macro SAVE_S4 /* for each of s12..s15 in turn: add s0 * accumulator to the float at YO, then step YO by INC_Y; overwrites s4 and s5 */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4 , s0, s12 | |||||
| fstmias YO, { s4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s5 } | |||||
| vmla.f32 s5 , s0, s13 | |||||
| fstmias YO, { s5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4 , s0, s14 | |||||
| fstmias YO, { s4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s5 } | |||||
| vmla.f32 s5 , s0, s15 | |||||
| fstmias YO, { s5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* set the single accumulator s12 to +0.0; overwrites r3 */ | |||||
| mov r3 , #0 /* r3 is dead at the call site; flags are left untouched */ | |||||
| vmov s12 , r3 /* all-zero bits = +0.0; s12 - s12 would be NaN if s12 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* s12 += (float at XO) * (float at AO1); AO1 steps by LDA, XO steps by INC_X; overwrites s2 and s8 */ | |||||
| fldmias XO , { s2 } | |||||
| fldmias AO1 , { s8 } | |||||
| vmla.f32 s12 , s2 , s8 | |||||
| add AO1, AO1, LDA | |||||
| add XO, XO , INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* add s0 * s12 to the float at YO, then step YO by INC_Y; overwrites s4 */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4, s0, s12 | |||||
| fstmias YO , { s4 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* Entry: r0 = M, r1 = N, r3 = A; lda, x, inc_x, y and inc_y are loaded from [fp, #0] to [fp, #16]. Updates y in place through the SAVE macros and returns 0 in r0. Does nothing when M <= 0, N <= 0, inc_x == 0 or inc_y == 0. */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} | |||||
| add fp, sp, #28 /* fp = value of sp before the push (7 registers) */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r12, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
| cmp OLD_M, #0 | |||||
| ble gemvn_kernel_L999 | |||||
| cmp N, #0 | |||||
| ble gemvn_kernel_L999 | |||||
| str OLD_A, A /* spill A: r3 is used as scratch from here on */ | |||||
| str OLD_M, M /* spill M: r0 is reused as AO1 */ | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 | |||||
| beq gemvn_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq gemvn_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #3 // LDA * SIZE | |||||
| #else | |||||
| lsl LDA, LDA, #2 // LDA * SIZE | |||||
| #endif | |||||
| cmp INC_X, #1 | |||||
| bne gemvn_kernel_S4_BEGIN /* any increment other than 1 takes the strided path */ | |||||
| cmp INC_Y, #1 | |||||
| bne gemvn_kernel_S4_BEGIN | |||||
| gemvn_kernel_F4_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #3 // I = M / 8 | |||||
| ble gemvn_kernel_F1_BEGIN | |||||
| gemvn_kernel_F4X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #8*SIZE | |||||
| str r3 , A /* next pass starts 8 elements further into A */ | |||||
| add AO2, AO2, LDA | |||||
| add AO2, AO2, LDA /* AO2 = AO1 + 3 * LDA; the macros only prefetch through it */ | |||||
| ldr XO , X | |||||
| INIT_F8 | |||||
| asrs J, N, #3 // J = N / 8 | |||||
| ble gemvn_kernel_F4X1 | |||||
| gemvn_kernel_F4X4_10: | |||||
| KERNEL_F8X8 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F4X4_10 | |||||
| gemvn_kernel_F4X1: | |||||
| ands J, N , #7 /* J = N mod 8, the steps left after the unrolled loop */ | |||||
| ble gemvn_kernel_F4_END | |||||
| gemvn_kernel_F4X1_10: | |||||
| KERNEL_F8X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F4X1_10 | |||||
| gemvn_kernel_F4_END: | |||||
| SAVE_F8 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_F4X4 | |||||
| gemvn_kernel_F1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #7 /* I = M mod 8, handled one element at a time */ | |||||
| ble gemvn_kernel_L999 | |||||
| gemvn_kernel_F1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #SIZE | |||||
| str r3, A /* next pass starts 1 element further into A */ | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| mov J, N /* N > 0 was checked above, so the loop below runs at least once */ | |||||
| gemvn_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F1X1_10 | |||||
| gemvn_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_F1X1 | |||||
| b gemvn_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| gemvn_kernel_S4_BEGIN: | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = M / 4 | |||||
| ble gemvn_kernel_S1_BEGIN | |||||
| gemvn_kernel_S4X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA /* AO2 = AO1 + LDA; the macros only prefetch through it */ | |||||
| add r3 , AO1, #4*SIZE | |||||
| str r3 , A /* next pass starts 4 elements further into A */ | |||||
| ldr XO , X | |||||
| INIT_S4 | |||||
| asrs J, N, #2 // J = N / 4 | |||||
| ble gemvn_kernel_S4X1 | |||||
| gemvn_kernel_S4X4_10: | |||||
| KERNEL_S4X4 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S4X4_10 | |||||
| gemvn_kernel_S4X1: | |||||
| ands J, N , #3 /* J = N mod 4 */ | |||||
| ble gemvn_kernel_S4_END | |||||
| gemvn_kernel_S4X1_10: | |||||
| KERNEL_S4X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S4X1_10 | |||||
| gemvn_kernel_S4_END: | |||||
| SAVE_S4 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_S4X4 | |||||
| gemvn_kernel_S1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #3 /* I = M mod 4 */ | |||||
| ble gemvn_kernel_L999 | |||||
| gemvn_kernel_S1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #SIZE | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| mov J, N | |||||
| gemvn_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S1X1_10 | |||||
| gemvn_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_S1X1 | |||||
| /*************************************************************************************************************/ | |||||
| gemvn_kernel_L999: | |||||
| sub r3, fp, #192 /* same address the prologue saved the registers to */ | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 /* back to the sp value right after the push */ | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,781 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/19 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_LDA [fp, #0 ] /* fp-relative memory operands with non-negative offsets */ | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #define OLD_A r3 | |||||
| #define OLD_M r0 | |||||
| #define AO1 r0 /* same register as OLD_M */ | |||||
| #define N r1 | |||||
| #define J r2 | |||||
| #define AO2 r4 /* pld address in KERNEL_F8X1, advanced by LDA alongside AO1 */ | |||||
| #define XO r5 | |||||
| #define YO r6 | |||||
| #define LDA r7 | |||||
| #define INC_X r8 | |||||
| #define INC_Y r9 | |||||
| #define I r12 | |||||
| #define M [fp, #-252 ] /* fp-relative memory operands with negative offsets */ | |||||
| #define A [fp, #-256 ] | |||||
| #define X_PRE 64 /* byte offsets added to the pld addresses */ | |||||
| #define Y_PRE 0 | |||||
| #define A_PRE 0 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(DOUBLE) | |||||
| .macro INIT_F8 /* start an 8-row block (unit stride): prefetch y and clear accumulators d24-d31 */ | |||||
| pld [ YO , #Y_PRE ] | |||||
| pld [ YO , #Y_PRE+32 ] | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov d24 , r3 , r3 /* exact +0.0; the former vsub d24,d24,d24 gives NaN when d24 held NaN or Inf */ | |||||
| vmov.f64 d25 , d24 | |||||
| vmov.f64 d26 , d24 | |||||
| vmov.f64 d27 , d24 | |||||
| vmov.f64 d28 , d24 | |||||
| vmov.f64 d29 , d24 | |||||
| vmov.f64 d30 , d24 | |||||
| vmov.f64 d31 , d24 | |||||
| .endm | |||||
| .macro KERNEL_F8X8 /* eight column steps of KERNEL_F8X1, with a prefetch of x before each group of four */ | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| .endm | |||||
| .macro KERNEL_F8X1 /* one column step, unit stride: d24-d31 += x element * eight consecutive doubles at AO1 */ | |||||
| fldmiad XO! , { d4 } /* d4 = next x element, XO post-incremented */ | |||||
| fldmiad AO1 , { d8 - d15 } /* eight rows of the current column, no writeback */ | |||||
| vmla.f64 d24 , d4 , d8 | |||||
| pld [ AO2 , #A_PRE ] /* AO2 runs one column ahead of AO1, so this prefetches the next column */ | |||||
| vmla.f64 d25 , d4 , d9 | |||||
| pld [ AO2 , #A_PRE+32 ] | |||||
| vmla.f64 d26 , d4 , d10 | |||||
| vmla.f64 d27 , d4 , d11 | |||||
| vmla.f64 d28 , d4 , d12 | |||||
| vmla.f64 d29 , d4 , d13 | |||||
| add AO1, AO1, LDA /* same rows, next column */ | |||||
| vmla.f64 d30 , d4 , d14 | |||||
| add AO2, AO2, LDA | |||||
| vmla.f64 d31 , d4 , d15 | |||||
| .endm | |||||
| .macro SAVE_F8 /* y[0..7] += d0 * d24..d31, then YO moves past the eight elements. NOTE(review): d0 is never written in this file, presumably alpha from the caller - confirm */ | |||||
| fldmiad YO, { d16 - d23 } | |||||
| vmla.f64 d16, d0, d24 | |||||
| vmla.f64 d17, d0, d25 | |||||
| vmla.f64 d18, d0, d26 | |||||
| vmla.f64 d19, d0, d27 | |||||
| vmla.f64 d20, d0, d28 | |||||
| vmla.f64 d21, d0, d29 | |||||
| vmla.f64 d22, d0, d30 | |||||
| vmla.f64 d23, d0, d31 | |||||
| fstmiad YO!, { d16 - d23 } | |||||
| .endm | |||||
| .macro INIT_F1 /* start a single-row pass (unit stride): clear accumulator d24 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov d24 , r3 , r3 /* exact +0.0; the former vsub d24,d24,d24 gives NaN when d24 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* one column step for a single row, unit stride: d24 += x element * A element */ | |||||
| fldmiad XO! , { d4 } | |||||
| fldmiad AO1 , { d8 } | |||||
| vmla.f64 d24 , d4 , d8 | |||||
| add AO1, AO1, LDA /* same row, next column */ | |||||
| .endm | |||||
| .macro SAVE_F1 /* y[0] += d0 * d24, then YO moves to the next element */ | |||||
| fldmiad YO, { d16 } | |||||
| vmla.f64 d16, d0, d24 | |||||
| fstmiad YO!, { d16 } | |||||
| .endm | |||||
| /*********************************************************************************************/ | |||||
| .macro INIT_S8 /* start an 8-row block (strided path): clear accumulators d24-d31 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov d24 , r3 , r3 /* exact +0.0; the former vsub d24,d24,d24 gives NaN when d24 held NaN or Inf */ | |||||
| vmov.f64 d25 , d24 | |||||
| vmov.f64 d26 , d24 | |||||
| vmov.f64 d27 , d24 | |||||
| vmov.f64 d28 , d24 | |||||
| vmov.f64 d29 , d24 | |||||
| vmov.f64 d30 , d24 | |||||
| vmov.f64 d31 , d24 | |||||
| .endm | |||||
| .macro KERNEL_S8X8 /* eight column steps of KERNEL_S8X1 */ | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| .endm | |||||
| .macro KERNEL_S8X1 /* one column step, strided x: d24-d31 += x element * eight consecutive doubles at AO1 */ | |||||
| pld [ AO2 , #A_PRE ] | |||||
| pld [ AO2 , #A_PRE+32 ] | |||||
| fldmiad XO , { d4 } /* no writeback: XO is advanced by INC_X bytes below */ | |||||
| fldmiad AO1 , { d8 - d15 } | |||||
| vmla.f64 d24 , d4 , d8 | |||||
| vmla.f64 d25 , d4 , d9 | |||||
| vmla.f64 d26 , d4 , d10 | |||||
| vmla.f64 d27 , d4 , d11 | |||||
| vmla.f64 d28 , d4 , d12 | |||||
| vmla.f64 d29 , d4 , d13 | |||||
| vmla.f64 d30 , d4 , d14 | |||||
| vmla.f64 d31 , d4 , d15 | |||||
| add AO1, AO1, LDA | |||||
| add AO2, AO2, LDA | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S8 /* eight read-modify-write steps y += d0 * accumulator, YO advancing by INC_Y bytes after each */ | |||||
| fldmiad YO, { d16 } | |||||
| vmla.f64 d16, d0, d24 | |||||
| fstmiad YO, { d16 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d17 } | |||||
| vmla.f64 d17, d0, d25 | |||||
| fstmiad YO, { d17 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d18 } | |||||
| vmla.f64 d18, d0, d26 | |||||
| fstmiad YO, { d18 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d19 } | |||||
| vmla.f64 d19, d0, d27 | |||||
| fstmiad YO, { d19 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d20 } | |||||
| vmla.f64 d20, d0, d28 | |||||
| fstmiad YO, { d20 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d21 } | |||||
| vmla.f64 d21, d0, d29 | |||||
| fstmiad YO, { d21 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d22 } | |||||
| vmla.f64 d22, d0, d30 | |||||
| fstmiad YO, { d22 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d23 } | |||||
| vmla.f64 d23, d0, d31 | |||||
| fstmiad YO, { d23 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* start a single-row pass (strided path): clear accumulator d24 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov d24 , r3 , r3 /* exact +0.0; the former vsub d24,d24,d24 gives NaN when d24 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* one column step for a single row, strided x: d24 += x element * A element */ | |||||
| fldmiad XO , { d4 } | |||||
| fldmiad AO1 , { d8 } | |||||
| vmla.f64 d24 , d4 , d8 | |||||
| add AO1, AO1, LDA | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* y element += d0 * d24, then YO advances by INC_Y bytes */ | |||||
| fldmiad YO, { d16 } | |||||
| vmla.f64 d16, d0, d24 | |||||
| fstmiad YO, { d16 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #else /************************* SINGLE PRECISION *****************************************/ | |||||
| .macro INIT_F8 /* start an 8-row block (unit stride): prefetch y and clear accumulators s24-s31 */ | |||||
| pld [ YO , #Y_PRE ] | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov s24 , r3 /* exact +0.0f; the former vsub s24,s24,s24 gives NaN when s24 held NaN or Inf */ | |||||
| vmov.f32 s25 , s24 | |||||
| vmov.f32 s26 , s24 | |||||
| vmov.f32 s27 , s24 | |||||
| vmov.f32 s28 , s24 | |||||
| vmov.f32 s29 , s24 | |||||
| vmov.f32 s30 , s24 | |||||
| vmov.f32 s31 , s24 | |||||
| .endm | |||||
| .macro KERNEL_F8X8 /* eight column steps of KERNEL_F8X1 after one prefetch of x */ | |||||
| pld [ XO , #X_PRE ] | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| KERNEL_F8X1 | |||||
| .endm | |||||
| .macro KERNEL_F8X1 /* one column step, unit stride: s24-s31 += x element * eight consecutive floats at AO1 */ | |||||
| pld [ AO2 , #A_PRE ] /* AO2 runs one column ahead of AO1 */ | |||||
| fldmias XO! , { s4 } /* s4 = next x element, XO post-incremented */ | |||||
| fldmias AO1 , { s8 - s15 } | |||||
| vmla.f32 s24 , s4 , s8 | |||||
| vmla.f32 s25 , s4 , s9 | |||||
| vmla.f32 s26 , s4 , s10 | |||||
| vmla.f32 s27 , s4 , s11 | |||||
| vmla.f32 s28 , s4 , s12 | |||||
| vmla.f32 s29 , s4 , s13 | |||||
| vmla.f32 s30 , s4 , s14 | |||||
| vmla.f32 s31 , s4 , s15 | |||||
| add AO1, AO1, LDA /* same rows, next column */ | |||||
| add AO2, AO2, LDA | |||||
| .endm | |||||
| .macro SAVE_F8 /* y[0..7] += s0 * s24..s31, then YO moves past the eight elements. NOTE(review): s0 is never written in this file, presumably alpha from the caller - confirm */ | |||||
| fldmias YO, { s16 - s23 } | |||||
| vmla.f32 s16, s0, s24 | |||||
| vmla.f32 s17, s0, s25 | |||||
| vmla.f32 s18, s0, s26 | |||||
| vmla.f32 s19, s0, s27 | |||||
| vmla.f32 s20, s0, s28 | |||||
| vmla.f32 s21, s0, s29 | |||||
| vmla.f32 s22, s0, s30 | |||||
| vmla.f32 s23, s0, s31 | |||||
| fstmias YO!, { s16 - s23 } | |||||
| .endm | |||||
| .macro INIT_F1 /* start a single-row pass (unit stride): clear accumulator s24 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov s24 , r3 /* exact +0.0f; the former vsub s24,s24,s24 gives NaN when s24 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* one column step for a single row, unit stride: s24 += x element * A element */ | |||||
| fldmias XO! , { s4 } | |||||
| fldmias AO1 , { s8 } | |||||
| vmla.f32 s24 , s4 , s8 | |||||
| add AO1, AO1, LDA /* same row, next column */ | |||||
| .endm | |||||
| .macro SAVE_F1 /* y[0] += s0 * s24, then YO moves to the next element */ | |||||
| fldmias YO, { s16 } | |||||
| vmla.f32 s16, s0, s24 | |||||
| fstmias YO!, { s16 } | |||||
| .endm | |||||
| /*********************************************************************************************/ | |||||
| .macro INIT_S8 /* start an 8-row block (strided path): clear accumulators s24-s31 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov s24 , r3 /* exact +0.0f; the former vsub s24,s24,s24 gives NaN when s24 held NaN or Inf */ | |||||
| vmov.f32 s25 , s24 | |||||
| vmov.f32 s26 , s24 | |||||
| vmov.f32 s27 , s24 | |||||
| vmov.f32 s28 , s24 | |||||
| vmov.f32 s29 , s24 | |||||
| vmov.f32 s30 , s24 | |||||
| vmov.f32 s31 , s24 | |||||
| .endm | |||||
| .macro KERNEL_S8X8 /* eight column steps of KERNEL_S8X1 */ | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| KERNEL_S8X1 | |||||
| .endm | |||||
| .macro KERNEL_S8X1 /* one column step, strided x: s24-s31 += x element * eight consecutive floats at AO1 */ | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmias XO , { s4 } /* no writeback: XO is advanced by INC_X bytes below */ | |||||
| fldmias AO1 , { s8 - s15 } | |||||
| vmla.f32 s24 , s4 , s8 | |||||
| vmla.f32 s25 , s4 , s9 | |||||
| vmla.f32 s26 , s4 , s10 | |||||
| vmla.f32 s27 , s4 , s11 | |||||
| vmla.f32 s28 , s4 , s12 | |||||
| vmla.f32 s29 , s4 , s13 | |||||
| vmla.f32 s30 , s4 , s14 | |||||
| vmla.f32 s31 , s4 , s15 | |||||
| add AO1, AO1, LDA | |||||
| add AO2, AO2, LDA | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S8 /* eight read-modify-write steps y += s0 * accumulator, YO advancing by INC_Y bytes after each */ | |||||
| fldmias YO, { s16 } | |||||
| vmla.f32 s16, s0, s24 | |||||
| fstmias YO, { s16 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s17 } | |||||
| vmla.f32 s17, s0, s25 | |||||
| fstmias YO, { s17 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s18 } | |||||
| vmla.f32 s18, s0, s26 | |||||
| fstmias YO, { s18 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s19 } | |||||
| vmla.f32 s19, s0, s27 | |||||
| fstmias YO, { s19 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s20 } | |||||
| vmla.f32 s20, s0, s28 | |||||
| fstmias YO, { s20 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s21 } | |||||
| vmla.f32 s21, s0, s29 | |||||
| fstmias YO, { s21 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s22 } | |||||
| vmla.f32 s22, s0, s30 | |||||
| fstmias YO, { s22 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s23 } | |||||
| vmla.f32 s23, s0, s31 | |||||
| fstmias YO, { s23 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* start a single-row pass (strided path): clear accumulator s24 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the advanced A pointer has already been stored */ | |||||
| vmov s24 , r3 /* exact +0.0f; the former vsub s24,s24,s24 gives NaN when s24 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* one column step for a single row, strided x: s24 += x element * A element */ | |||||
| fldmias XO , { s4 } | |||||
| fldmias AO1 , { s8 } | |||||
| vmla.f32 s24 , s4 , s8 | |||||
| add AO1, AO1, LDA | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* y element += s0 * s24, then YO advances by INC_Y bytes */ | |||||
| fldmias YO, { s16 } | |||||
| vmla.f32 s16, s0, s24 | |||||
| fstmias YO, { s16 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* GEMV kernel, non-transposed: y += d0/s0 * (A times x), A addressed column by column with stride LDA. | |||||
| * On entry r0 = row count, r1 = column count, r3 = A; lda, x, inc_x, y, inc_y are read from the stack. | |||||
| * Rows are handled in blocks of eight, then one at a time. The F labels are the path taken when | |||||
| * inc_x == 1 and inc_y == 1, the S labels the strided path. Returns 0 in r0. | |||||
| * NOTE(review): d0/s0 is presumably alpha passed under the hard-float ABI - confirm. */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} | |||||
| add fp, sp, #28 /* seven registers were pushed, so fp now equals sp on entry */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack
| sub r12, fp, #192 /* FP register save area starts 192 bytes below fp */ | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s31 } // store floating point registers | |||||
| #endif | |||||
| cmp OLD_M, #0 /* nothing to do when the row or column count is <= 0 */ | |||||
| ble gemvn_kernel_L999 | |||||
| cmp N, #0 | |||||
| ble gemvn_kernel_L999 | |||||
| str OLD_A, A /* spill A and the row count: r3 becomes scratch and r0 becomes AO1 */ | |||||
| str OLD_M, M | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 /* a zero increment is treated as "nothing to do" */ | |||||
| beq gemvn_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq gemvn_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #3 // LDA * SIZE | |||||
| #else | |||||
| lsl LDA, LDA, #2 // LDA * SIZE | |||||
| #endif | |||||
| cmp INC_X, #1 /* the unit-stride path needs both increments equal to 1 */ | |||||
| bne gemvn_kernel_S8_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne gemvn_kernel_S8_BEGIN | |||||
| gemvn_kernel_F8_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #3 // I = M / 8 | |||||
| ble gemvn_kernel_F1_BEGIN | |||||
| gemvn_kernel_F8X8: /* one pass per block of eight rows */ | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #8*SIZE /* the next block starts eight rows further down */ | |||||
| str r3 , A | |||||
| ldr XO , X /* x is re-read from its start for every row block */ | |||||
| INIT_F8 | |||||
| asrs J, N, #3 // J = N / 8 | |||||
| ble gemvn_kernel_F8X1 | |||||
| gemvn_kernel_F8X8_10: | |||||
| KERNEL_F8X8 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F8X8_10 | |||||
| gemvn_kernel_F8X1: /* remaining N mod 8 columns */ | |||||
| ands J, N , #7 | |||||
| ble gemvn_kernel_F8_END | |||||
| gemvn_kernel_F8X1_10: | |||||
| KERNEL_F8X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F8X1_10 | |||||
| gemvn_kernel_F8_END: | |||||
| SAVE_F8 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_F8X8 | |||||
| gemvn_kernel_F1_BEGIN: /* remaining M mod 8 rows, one row per pass */ | |||||
| ldr I, M | |||||
| ands I, I , #7 | |||||
| ble gemvn_kernel_L999 | |||||
| gemvn_kernel_F1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #SIZE | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| mov J, N | |||||
| gemvn_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_F1X1_10 | |||||
| gemvn_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_F1X1 | |||||
| b gemvn_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| gemvn_kernel_S8_BEGIN: /* strided path: same loop structure, increments converted to bytes first */ | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #3 // I = M / 8 | |||||
| ble gemvn_kernel_S1_BEGIN | |||||
| gemvn_kernel_S8X8: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #8*SIZE | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S8 | |||||
| asrs J, N, #3 // J = N / 8 | |||||
| ble gemvn_kernel_S8X1 | |||||
| gemvn_kernel_S8X8_10: | |||||
| KERNEL_S8X8 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S8X8_10 | |||||
| gemvn_kernel_S8X1: | |||||
| ands J, N , #7 | |||||
| ble gemvn_kernel_S8_END | |||||
| gemvn_kernel_S8X1_10: | |||||
| KERNEL_S8X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S8X1_10 | |||||
| gemvn_kernel_S8_END: | |||||
| SAVE_S8 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_S8X8 | |||||
| gemvn_kernel_S1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #7 | |||||
| ble gemvn_kernel_L999 | |||||
| gemvn_kernel_S1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #SIZE | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| mov J, N | |||||
| gemvn_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs J, J, #1 | |||||
| bne gemvn_kernel_S1X1_10 | |||||
| gemvn_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| subs I , I , #1 | |||||
| bne gemvn_kernel_S1X1 | |||||
| /*************************************************************************************************************/ | |||||
| gemvn_kernel_L999: /* common exit: restore FP registers, return 0, unwind the frame */ | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s31 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,67 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
| * Reference (portable C) transposed matrix-vector kernel. | |||||
| * | |||||
| * For each j in [0, n) the first m entries of the column starting at | |||||
| * a + j*lda are multiplied element-wise with x (read with stride inc_x), | |||||
| * summed, scaled by alpha and added to y[j*inc_y]. | |||||
| * dummy1 and buffer are not used. | |||||
| * | |||||
| * Returns 0. The function is declared int, so falling off the end without | |||||
| * a return statement left the result undefined for any caller reading it. | |||||
| */ | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT temp; | |||||
| iy = 0; | |||||
| a_ptr = a; | |||||
| for (j=0; j<n; j++) | |||||
| { | |||||
| /* dot product of column j with x */ | |||||
| temp = 0.0; | |||||
| ix = 0; | |||||
| for (i=0; i<m; i++) | |||||
| { | |||||
| temp += a_ptr[i] * x[ix]; | |||||
| ix += inc_x; | |||||
| } | |||||
| y[iy] += alpha * temp; | |||||
| iy += inc_y; | |||||
| a_ptr += lda; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,750 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/25 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes taken from the stack in the prologue (spill slots + FP register save area) */ | |||||
| #define OLD_LDA [fp, #0 ] /* fp is set to the sp value on entry, so [fp, #0] and up are the stack-passed arguments */ | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #define OLD_A r3 /* register arguments on entry; r3 and r1 are spilled to the A and N slots before being reused */ | |||||
| #define OLD_N r1 | |||||
| #define M r0 /* row count; r0 is only overwritten at the exit label */ | |||||
| #define AO1 r1 /* read pointer walking down the first column of the current pair */ | |||||
| #define J r2 /* outer (column) loop counter */ | |||||
| #define AO2 r4 /* read pointer walking down the second column (AO1 + LDA) */ | |||||
| #define XO r5 /* running pointer into x */ | |||||
| #define YO r6 /* running pointer into y */ | |||||
| #define LDA r7 /* column stride of A, converted to bytes in the prologue */ | |||||
| #define INC_X r8 /* element increments; scaled to bytes only on the strided path */ | |||||
| #define INC_Y r9 | |||||
| #define I r12 /* inner (row) loop counter */ | |||||
| #define N [fp, #-252 ] /* spill slot: column count */ | |||||
| #define A [fp, #-256 ] /* spill slot: start of the next unprocessed column of A */ | |||||
| #define X_PRE 512 /* pld byte offsets applied to the x and A pointers */ | |||||
| #define A_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(DOUBLE) | |||||
| .macro INIT_F2 /* start a two-column pass (unit stride): clear the dot-product accumulators d2 and d3 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the next-column pointer has already been stored */ | |||||
| vmov d2 , r3 , r3 /* exact +0.0; the former vsub d2,d2,d2 gives NaN when d2 held NaN or Inf */ | |||||
| vmov d3 , r3 , r3 | |||||
| .endm | |||||
| .macro KERNEL_F2X4 /* four rows, two columns, unit stride: d2 += x[0..3] . AO1[0..3], d3 += x[0..3] . AO2[0..3] */ | |||||
| pld [ XO , #X_PRE ] | |||||
| fldmiad XO! , { d12 - d15 } /* four x elements, XO post-incremented */ | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad AO2!, { d4 - d5 } | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| fldmiad AO2!, { d6 - d7 } | |||||
| vmla.f64 d2 , d12 , d8 | |||||
| vmla.f64 d3 , d12 , d4 | |||||
| vmla.f64 d2 , d13 , d9 | |||||
| vmla.f64 d3 , d13 , d5 | |||||
| vmla.f64 d2 , d14, d10 | |||||
| vmla.f64 d3 , d14, d6 | |||||
| vmla.f64 d2 , d15, d11 | |||||
| vmla.f64 d3 , d15, d7 | |||||
| .endm | |||||
| .macro KERNEL_F2X1 /* one row, two columns, unit stride */ | |||||
| fldmiad XO! , { d1 } | |||||
| fldmiad AO1!, { d8 } | |||||
| fldmiad AO2!, { d4 } | |||||
| vmla.f64 d2 , d1 , d8 | |||||
| vmla.f64 d3 , d1 , d4 | |||||
| .endm | |||||
| .macro SAVE_F2 /* y[0..1] += d0 * {d2, d3}, then YO moves past the two elements. NOTE(review): d0 is never written in this file, presumably alpha from the caller - confirm */ | |||||
| fldmiad YO, { d4 - d5 } | |||||
| vmla.f64 d4, d0, d2 | |||||
| vmla.f64 d5, d0, d3 | |||||
| fstmiad YO!, { d4 - d5 } | |||||
| .endm | |||||
| .macro INIT_F1 /* start the single-column pass (unit stride): clear the dot-product accumulator d2 */ | |||||
| mov r3 , #0 /* r3 is dead at the expansion site: it is not read again before the exit code reloads it */ | |||||
| vmov d2 , r3 , r3 /* exact +0.0; the former vsub d2,d2,d2 gives NaN when d2 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X4 /* four rows, one column, unit stride: d2 += x[0..3] . AO1[0..3] */ | |||||
| pld [ XO , #X_PRE ] | |||||
| fldmiad XO! , { d12 - d15 } | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| vmla.f64 d2 , d12 , d8 | |||||
| vmla.f64 d2 , d13 , d9 | |||||
| vmla.f64 d2 , d14, d10 | |||||
| vmla.f64 d2 , d15, d11 | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* one row, one column, unit stride */ | |||||
| fldmiad XO! , { d1 } | |||||
| fldmiad AO1!, { d8 } | |||||
| vmla.f64 d2 , d1 , d8 | |||||
| .endm | |||||
| .macro SAVE_F1 /* y[0] += d0 * d2, then YO moves to the next element */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4, d0, d2 | |||||
| fstmiad YO!, { d4 } | |||||
| .endm | |||||
| .macro INIT_S2 /* start a two-column pass (strided path): clear the dot-product accumulators d2 and d3 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the next-column pointer has already been stored */ | |||||
| vmov d2 , r3 , r3 /* exact +0.0; the former vsub d2,d2,d2 gives NaN when d2 held NaN or Inf */ | |||||
| vmov d3 , r3 , r3 | |||||
| .endm | |||||
| .macro KERNEL_S2X4 /* four rows, two columns, strided x: the x elements are loaded one by one, INC_X bytes apart */ | |||||
| fldmiad XO , { d12 } | |||||
| add XO, XO, INC_X | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad AO2!, { d4 - d5 } | |||||
| fldmiad XO , { d13 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| fldmiad AO2!, { d6 - d7 } | |||||
| fldmiad XO , { d14 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad XO , { d15 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d2 , d12 , d8 | |||||
| vmla.f64 d3 , d12 , d4 | |||||
| vmla.f64 d2 , d13 , d9 | |||||
| vmla.f64 d3 , d13 , d5 | |||||
| vmla.f64 d2 , d14, d10 | |||||
| vmla.f64 d3 , d14, d6 | |||||
| vmla.f64 d2 , d15, d11 | |||||
| vmla.f64 d3 , d15, d7 | |||||
| .endm | |||||
| .macro KERNEL_S2X1 /* one row, two columns, strided x */ | |||||
| fldmiad XO , { d1 } | |||||
| fldmiad AO1!, { d8 } | |||||
| fldmiad AO2!, { d4 } | |||||
| vmla.f64 d2 , d1 , d8 | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d3 , d1 , d4 | |||||
| .endm | |||||
| .macro SAVE_S2 /* two read-modify-write steps y += d0 * accumulator, YO advancing by INC_Y bytes after each */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4, d0, d2 | |||||
| fstmiad YO, { d4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d5 } | |||||
| vmla.f64 d5, d0, d3 | |||||
| fstmiad YO, { d5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* start the single-column pass (strided path): clear the dot-product accumulator d2 */ | |||||
| mov r3 , #0 /* r3 is dead at the expansion site: it is not read again before the exit code reloads it */ | |||||
| vmov d2 , r3 , r3 /* exact +0.0; the former vsub d2,d2,d2 gives NaN when d2 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X4 /* four rows, one column, strided x */ | |||||
| fldmiad XO , { d12 } | |||||
| add XO, XO, INC_X | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| fldmiad XO , { d13 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| fldmiad XO , { d14 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad XO , { d15 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d2 , d12 , d8 | |||||
| vmla.f64 d2 , d13 , d9 | |||||
| vmla.f64 d2 , d14, d10 | |||||
| vmla.f64 d2 , d15, d11 | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* one row, one column, strided x */ | |||||
| fldmiad XO , { d1 } | |||||
| fldmiad AO1!, { d8 } | |||||
| vmla.f64 d2 , d1 , d8 | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* y element += d0 * d2, then YO advances by INC_Y bytes */ | |||||
| fldmiad YO, { d4 } | |||||
| vmla.f64 d4, d0, d2 | |||||
| fstmiad YO, { d4 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #else /************************* SINGLE PRECISION *****************************************/ | |||||
| .macro INIT_F2 /* start a two-column pass (unit stride): clear the dot-product accumulators s2 and s3 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the next-column pointer has already been stored */ | |||||
| vmov s2 , r3 /* exact +0.0f; the former vsub s2,s2,s2 gives NaN when s2 held NaN or Inf */ | |||||
| vmov s3 , r3 | |||||
| .endm | |||||
| .macro KERNEL_F2X4 /* four rows, two columns, unit stride: s2 += x[0..3] . AO1[0..3], s3 += x[0..3] . AO2[0..3] */ | |||||
| fldmias XO! , { s12 - s15 } | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias AO2!, { s4 - s5 } | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| fldmias AO2!, { s6 - s7 } | |||||
| vmla.f32 s2 , s12 , s8 | |||||
| vmla.f32 s3 , s12 , s4 | |||||
| vmla.f32 s2 , s13 , s9 | |||||
| vmla.f32 s3 , s13 , s5 | |||||
| vmla.f32 s2 , s14, s10 | |||||
| vmla.f32 s3 , s14, s6 | |||||
| vmla.f32 s2 , s15, s11 | |||||
| vmla.f32 s3 , s15, s7 | |||||
| .endm | |||||
| .macro KERNEL_F2X1 /* one row, two columns, unit stride */ | |||||
| fldmias XO! , { s1 } | |||||
| fldmias AO1!, { s8 } | |||||
| fldmias AO2!, { s4 } | |||||
| vmla.f32 s2 , s1 , s8 | |||||
| vmla.f32 s3 , s1 , s4 | |||||
| .endm | |||||
| .macro SAVE_F2 /* y[0..1] += s0 * {s2, s3}, then YO moves past the two elements. NOTE(review): s0 is never written in this file, presumably alpha from the caller - confirm */ | |||||
| fldmias YO, { s4 - s5 } | |||||
| vmla.f32 s4, s0, s2 | |||||
| vmla.f32 s5, s0, s3 | |||||
| fstmias YO!, { s4 - s5 } | |||||
| .endm | |||||
| .macro INIT_F1 /* start the single-column pass (unit stride): clear the dot-product accumulator s2 */ | |||||
| mov r3 , #0 /* r3 is dead at the expansion site: it is not read again before the exit code reloads it */ | |||||
| vmov s2 , r3 /* exact +0.0f; the former vsub s2,s2,s2 gives NaN when s2 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_F1X4 /* four rows, one column, unit stride: s2 += x[0..3] . AO1[0..3] */ | |||||
| fldmias XO! , { s12 - s15 } | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| vmla.f32 s2 , s12 , s8 | |||||
| vmla.f32 s2 , s13 , s9 | |||||
| vmla.f32 s2 , s14, s10 | |||||
| vmla.f32 s2 , s15, s11 | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* one row, one column, unit stride */ | |||||
| fldmias XO! , { s1 } | |||||
| fldmias AO1!, { s8 } | |||||
| vmla.f32 s2 , s1 , s8 | |||||
| .endm | |||||
| .macro SAVE_F1 /* y[0] += s0 * s2, then YO moves to the next element */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4, s0, s2 | |||||
| fstmias YO!, { s4 } | |||||
| .endm | |||||
| .macro INIT_S2 /* start a two-column pass (strided path): clear the dot-product accumulators s2 and s3 */ | |||||
| mov r3 , #0 /* r3 is dead at every expansion site: the next-column pointer has already been stored */ | |||||
| vmov s2 , r3 /* exact +0.0f; the former vsub s2,s2,s2 gives NaN when s2 held NaN or Inf */ | |||||
| vmov s3 , r3 | |||||
| .endm | |||||
| .macro KERNEL_S2X4 /* four rows, two columns, strided x: the x elements are loaded one by one, INC_X bytes apart */ | |||||
| fldmias XO , { s12 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias AO2!, { s4 - s5 } | |||||
| fldmias XO , { s13 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| fldmias AO2!, { s6 - s7 } | |||||
| fldmias XO , { s14 } | |||||
| add XO, XO, INC_X | |||||
| fldmias XO , { s15 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s2 , s12 , s8 | |||||
| vmla.f32 s3 , s12 , s4 | |||||
| vmla.f32 s2 , s13 , s9 | |||||
| vmla.f32 s3 , s13 , s5 | |||||
| vmla.f32 s2 , s14, s10 | |||||
| vmla.f32 s3 , s14, s6 | |||||
| vmla.f32 s2 , s15, s11 | |||||
| vmla.f32 s3 , s15, s7 | |||||
| .endm | |||||
| .macro KERNEL_S2X1 /* one row, two columns, strided x */ | |||||
| fldmias XO , { s1 } | |||||
| fldmias AO1!, { s8 } | |||||
| fldmias AO2!, { s4 } | |||||
| vmla.f32 s2 , s1 , s8 | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s3 , s1 , s4 | |||||
| .endm | |||||
| .macro SAVE_S2 /* two read-modify-write steps y += s0 * accumulator, YO advancing by INC_Y bytes after each */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4, s0, s2 | |||||
| fstmias YO, { s4 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s5 } | |||||
| vmla.f32 s5, s0, s3 | |||||
| fstmias YO, { s5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* start the single-column pass (strided path): clear the dot-product accumulator s2 */ | |||||
| mov r3 , #0 /* r3 is dead at the expansion site: it is not read again before the exit code reloads it */ | |||||
| vmov s2 , r3 /* exact +0.0f; the former vsub s2,s2,s2 gives NaN when s2 held NaN or Inf */ | |||||
| .endm | |||||
| .macro KERNEL_S1X4 /* four rows, one column, strided x */ | |||||
| fldmias XO , { s12 } | |||||
| add XO, XO, INC_X | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias XO , { s13 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| fldmias XO , { s14 } | |||||
| add XO, XO, INC_X | |||||
| fldmias XO , { s15 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s2 , s12 , s8 | |||||
| vmla.f32 s2 , s13 , s9 | |||||
| vmla.f32 s2 , s14, s10 | |||||
| vmla.f32 s2 , s15, s11 | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* one row, one column, strided x */ | |||||
| fldmias XO , { s1 } | |||||
| fldmias AO1!, { s8 } | |||||
| vmla.f32 s2 , s1 , s8 | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
| .macro SAVE_S1 /* y element += s0 * s2, then YO advances by INC_Y bytes */ | |||||
| fldmias YO, { s4 } | |||||
| vmla.f32 s4, s0, s2 | |||||
| fstmias YO, { s4 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* GEMV kernel, transposed: each y element gets d0/s0 times the dot product of one column of A with x. | |||||
| * On entry r0 = row count, r1 = column count, r3 = A; lda, x, inc_x, y, inc_y are read from the stack. | |||||
| * Columns are handled two at a time, then a possible last single column; rows in groups of four plus a remainder. | |||||
| * The F labels are the path taken when inc_x == 1 and inc_y == 1, the S labels the strided path. | |||||
| * Returns 0 in r0. NOTE(review): d0/s0 is presumably alpha passed under the hard-float ABI - confirm. */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} | |||||
| add fp, sp, #28 /* seven registers were pushed, so fp now equals sp on entry */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack
| sub r12, fp, #192 /* FP register save area starts 192 bytes below fp */ | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
| cmp M, #0 /* nothing to do when the row or column count is <= 0 */ | |||||
| ble gemvt_kernel_L999 | |||||
| cmp OLD_N, #0 | |||||
| ble gemvt_kernel_L999 | |||||
| str OLD_A, A /* spill A and the column count: r3 becomes scratch and r1 becomes AO1 */ | |||||
| str OLD_N, N | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 /* a zero increment is treated as "nothing to do" */ | |||||
| beq gemvt_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq gemvt_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #3 // LDA * SIZE | |||||
| #else | |||||
| lsl LDA, LDA, #2 // LDA * SIZE | |||||
| #endif | |||||
| cmp INC_X, #1 /* the unit-stride path needs both increments equal to 1 */ | |||||
| bne gemvt_kernel_S2_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne gemvt_kernel_S2_BEGIN | |||||
| gemvt_kernel_F2_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble gemvt_kernel_F1_BEGIN | |||||
| gemvt_kernel_F2X4: /* one pass per pair of columns */ | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA /* the next pair starts two columns further on */ | |||||
| str r3 , A | |||||
| ldr XO , X /* x is re-read from its start for every column pair */ | |||||
| INIT_F2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_F2X1 | |||||
| gemvt_kernel_F2X4_10: | |||||
| KERNEL_F2X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F2X4_10 | |||||
| gemvt_kernel_F2X1: /* remaining M mod 4 rows */ | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_F2_END | |||||
| gemvt_kernel_F2X1_10: | |||||
| KERNEL_F2X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F2X1_10 | |||||
| gemvt_kernel_F2_END: | |||||
| SAVE_F2 | |||||
| subs J , J , #1 | |||||
| bne gemvt_kernel_F2X4 | |||||
| gemvt_kernel_F1_BEGIN: /* last column when the column count is odd */ | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble gemvt_kernel_L999 | |||||
| gemvt_kernel_F1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_F1X1 | |||||
| gemvt_kernel_F1X4_10: | |||||
| KERNEL_F1X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F1X4_10 | |||||
| gemvt_kernel_F1X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_F1_END | |||||
| gemvt_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F1X1_10 | |||||
| gemvt_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| b gemvt_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| gemvt_kernel_S2_BEGIN: /* strided path: same loop structure, increments converted to bytes first */ | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble gemvt_kernel_S1_BEGIN | |||||
| gemvt_kernel_S2X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_S2X1 | |||||
| gemvt_kernel_S2X4_10: | |||||
| KERNEL_S2X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S2X4_10 | |||||
| gemvt_kernel_S2X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_S2_END | |||||
| gemvt_kernel_S2X1_10: | |||||
| KERNEL_S2X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S2X1_10 | |||||
| gemvt_kernel_S2_END: | |||||
| SAVE_S2 | |||||
| subs J , J , #1 | |||||
| bne gemvt_kernel_S2X4 | |||||
| gemvt_kernel_S1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble gemvt_kernel_L999 | |||||
| gemvt_kernel_S1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_S1X1 | |||||
| gemvt_kernel_S1X4_10: | |||||
| KERNEL_S1X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S1X4_10 | |||||
| gemvt_kernel_S1X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_S1_END | |||||
| gemvt_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S1X1_10 | |||||
| gemvt_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| /*************************************************************************************************************/ | |||||
| gemvt_kernel_L999: /* common exit: restore FP registers, return 0, unwind the frame */ | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,732 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/18 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
// Bytes subtracted from sp in the prologue for local storage.
| #define STACKSIZE 256 | |||||
// Operands read relative to fp. The prologue sets fp to the sp value it had
// before pushing r4-r9/fp, so non-negative offsets address memory above the
// pushed registers (presumably the caller's stacked arguments - confirm against the ABI).
| #define OLD_LDA [fp, #0 ] | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
// Registers that hold inputs on entry.
| #define OLD_A r3 | |||||
| #define OLD_N r1 | |||||
| #define M r0 | |||||
// Working registers. AO1 shares r1 with OLD_N; the prologue stores OLD_N to the
// N slot before r1 is reused as a column pointer.
| #define AO1 r1 | |||||
| #define J r2 | |||||
| #define AO2 r4 | |||||
| #define XO r5 | |||||
| #define YO r6 | |||||
| #define LDA r7 | |||||
| #define INC_X r8 | |||||
| #define INC_Y r9 | |||||
| #define I r12 | |||||
// Spill slots below fp, inside the STACKSIZE area: column count and next column address.
| #define N [fp, #-252 ] | |||||
| #define A [fp, #-256 ] | |||||
// Byte offsets used by the pld prefetch hints in the unit-stride kernels.
| #define X_PRE 512 | |||||
| #define A_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(DOUBLE) | |||||
// INIT_F2 (double): set the two column accumulators d4 and d5 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or the previous column pair's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_F2 | |||||
| mov I , #0 | |||||
| vmov d4 , I , I | |||||
| vmov.f64 d5 , d4 | |||||
| .endm | |||||
// KERNEL_F2X4 (double, unit stride): consume four x values and four values from
// each of the two column pointers. d4 accumulates x*[AO1], d5 accumulates x*[AO2].
// XO, AO1 and AO2 are post-incremented by the loads; pld hints prefetch
// X_PRE / A_PRE bytes ahead. Loads are interleaved with the multiply-accumulates.
| .macro KERNEL_F2X4 | |||||
| pld [ XO , #X_PRE ] | |||||
| fldmiad XO! , { d28 - d31 } | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad AO2!, { d16 - d17 } | |||||
| vmla.f64 d4 , d28 , d8 | |||||
| vmla.f64 d5 , d28 , d16 | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| vmla.f64 d4 , d29 , d9 | |||||
| vmla.f64 d5 , d29 , d17 | |||||
| fldmiad AO2!, { d18 - d19 } | |||||
| vmla.f64 d4 , d30, d10 | |||||
| vmla.f64 d5 , d30, d18 | |||||
| vmla.f64 d4 , d31, d11 | |||||
| vmla.f64 d5 , d31, d19 | |||||
| .endm | |||||
// KERNEL_F2X1 (double, unit stride): remainder step - one x value (d2) times one
// value from each column pointer, accumulated into d4 (AO1) and d5 (AO2).
// All three pointers are post-incremented by one element.
| .macro KERNEL_F2X1 | |||||
| fldmiad XO! , { d2 } | |||||
| fldmiad AO1!, { d8 } | |||||
| fldmiad AO2!, { d16 } | |||||
| vmla.f64 d4 , d2 , d8 | |||||
| vmla.f64 d5 , d2 , d16 | |||||
| .endm | |||||
// SAVE_F2 (double, unit stride): y[0] += d0 * d4 and y[1] += d0 * d5 for the two
// consecutive values at YO, then advance YO past both.
// d0 is never written in this file; presumably it carries the caller's alpha - confirm.
| .macro SAVE_F2 | |||||
| fldmiad YO, { d24 - d25 } | |||||
| vmla.f64 d24, d0, d4 | |||||
| vmla.f64 d25, d0, d5 | |||||
| fstmiad YO!, { d24 - d25 } | |||||
| .endm | |||||
// INIT_S2 (double): set the two column accumulators d4 and d5 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or the previous column pair's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_S2 | |||||
| mov I , #0 | |||||
| vmov d4 , I , I | |||||
| vmov.f64 d5 , d4 | |||||
| .endm | |||||
// KERNEL_S2X4 (double, strided x): same arithmetic as the unit-stride 2x4 kernel,
// but each x value is loaded individually and XO is advanced by INC_X
// (already scaled to bytes by the caller code) after every load.
// AO1 / AO2 are still read contiguously with post-increment.
| .macro KERNEL_S2X4 | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad XO , { d28 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| pld [ AO2 , #A_PRE ] | |||||
| fldmiad AO2!, { d16 - d17 } | |||||
| vmla.f64 d4 , d28 , d8 | |||||
| fldmiad XO , { d29 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d5 , d28 , d16 | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| vmla.f64 d4 , d29 , d9 | |||||
| fldmiad XO , { d30 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d5 , d29 , d17 | |||||
| fldmiad AO2!, { d18 - d19 } | |||||
| vmla.f64 d4 , d30, d10 | |||||
| fldmiad XO , { d31 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d5 , d30, d18 | |||||
| vmla.f64 d4 , d31, d11 | |||||
| vmla.f64 d5 , d31, d19 | |||||
| .endm | |||||
// KERNEL_S2X1 (double, strided x): remainder step - one x value (d2), XO advanced
// by INC_X bytes, one value from each column accumulated into d4 / d5.
| .macro KERNEL_S2X1 | |||||
| fldmiad XO , { d2 } | |||||
| fldmiad AO1!, { d8 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO2!, { d16 } | |||||
| vmla.f64 d4 , d2 , d8 | |||||
| vmla.f64 d5 , d2 , d16 | |||||
| .endm | |||||
// SAVE_S2 (double, strided y): add d0 * d4 to the value at YO, step YO by INC_Y
// bytes, add d0 * d5 to the next value, and step YO again.
// d0 is never written in this file; presumably it carries the caller's alpha - confirm.
| .macro SAVE_S2 | |||||
| fldmiad YO, { d24 } | |||||
| vmla.f64 d24, d0, d4 | |||||
| fstmiad YO, { d24 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d24 } | |||||
| vmla.f64 d24, d0, d5 | |||||
| fstmiad YO, { d24 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
// INIT_F1 (double): set the single column accumulator d4 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or an earlier column's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_F1 | |||||
| mov I , #0 | |||||
| vmov d4 , I , I | |||||
| .endm | |||||
// KERNEL_F1X4 (double, unit stride): four x values times four values of the single
// column at AO1, accumulated into d4. XO and AO1 are post-incremented.
| .macro KERNEL_F1X4 | |||||
| pld [ XO , #X_PRE ] | |||||
| fldmiad XO! , { d28 - d31 } | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| vmla.f64 d4 , d28 , d8 | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| vmla.f64 d4 , d29 , d9 | |||||
| vmla.f64 d4 , d30, d10 | |||||
| vmla.f64 d4 , d31, d11 | |||||
| .endm | |||||
// KERNEL_F1X1 (double, unit stride): remainder step - d4 += x * a for one element,
// post-incrementing XO and AO1.
| .macro KERNEL_F1X1 | |||||
| fldmiad XO! , { d2 } | |||||
| fldmiad AO1!, { d8 } | |||||
| vmla.f64 d4 , d2 , d8 | |||||
| .endm | |||||
// SAVE_F1 (double, unit stride): add d0 * d4 to the value at YO and advance YO by
// one element.
| .macro SAVE_F1 | |||||
| fldmiad YO, { d24 } | |||||
| vmla.f64 d24, d0, d4 | |||||
| fstmiad YO!, { d24 } | |||||
| .endm | |||||
// INIT_S1 (double): set the single column accumulator d4 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or an earlier column's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_S1 | |||||
| mov I , #0 | |||||
| vmov d4 , I , I | |||||
| .endm | |||||
// KERNEL_S1X4 (double, strided x): four x values, each loaded individually with XO
// advanced by INC_X bytes afterwards, times four contiguous values at AO1,
// accumulated into d4.
| .macro KERNEL_S1X4 | |||||
| pld [ AO1 , #A_PRE ] | |||||
| fldmiad XO , { d28 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO1!, { d8 - d9 } | |||||
| vmla.f64 d4 , d28 , d8 | |||||
| fldmiad XO , { d29 } | |||||
| add XO, XO, INC_X | |||||
| fldmiad AO1!, { d10 - d11 } | |||||
| vmla.f64 d4 , d29 , d9 | |||||
| fldmiad XO , { d30 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d4 , d30, d10 | |||||
| fldmiad XO , { d31 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d4 , d31, d11 | |||||
| .endm | |||||
// KERNEL_S1X1 (double, strided x): remainder step - d4 += x * a for one element;
// XO steps by INC_X bytes, AO1 by one element.
| .macro KERNEL_S1X1 | |||||
| fldmiad XO , { d2 } | |||||
| fldmiad AO1!, { d8 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f64 d4 , d2 , d8 | |||||
| .endm | |||||
// SAVE_S1 (double, strided y): add d0 * d4 to the value at YO and step YO by
// INC_Y bytes.
| .macro SAVE_S1 | |||||
| fldmiad YO, { d24 } | |||||
| vmla.f64 d24, d0, d4 | |||||
| fstmiad YO, { d24 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #else /************************* SINGLE PRECISION *****************************************/ | |||||
// INIT_F2 (single): set the two column accumulators s4 and s5 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or the previous column pair's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_F2 | |||||
| mov I , #0 | |||||
| vmov s4 , I | |||||
| vmov.f32 s5 , s4 | |||||
| .endm | |||||
// KERNEL_F2X4 (single, unit stride): consume four x values (s28-s31) and four values
// from each of the two column pointers. s4 accumulates x*[AO1], s5 accumulates
// x*[AO2]. XO, AO1 and AO2 are post-incremented by the loads.
| .macro KERNEL_F2X4 | |||||
| fldmias XO! , { s28 - s31 } | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias AO2!, { s16 - s17 } | |||||
| vmla.f32 s4 , s28 , s8 | |||||
| vmla.f32 s5 , s28 , s16 | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| vmla.f32 s4 , s29 , s9 | |||||
| vmla.f32 s5 , s29 , s17 | |||||
| fldmias AO2!, { s18 - s19 } | |||||
| vmla.f32 s4 , s30, s10 | |||||
| vmla.f32 s5 , s30, s18 | |||||
| vmla.f32 s4 , s31, s11 | |||||
| vmla.f32 s5 , s31, s19 | |||||
| .endm | |||||
// KERNEL_F2X1 (single, unit stride): remainder step - one x value (s2) times one
// value from each column pointer, accumulated into s4 (AO1) and s5 (AO2).
| .macro KERNEL_F2X1 | |||||
| fldmias XO! , { s2 } | |||||
| fldmias AO1!, { s8 } | |||||
| fldmias AO2!, { s16 } | |||||
| vmla.f32 s4 , s2 , s8 | |||||
| vmla.f32 s5 , s2 , s16 | |||||
| .endm | |||||
// SAVE_F2 (single, unit stride): y[0] += s0 * s4 and y[1] += s0 * s5 for the two
// consecutive values at YO, then advance YO past both.
// s0 is never written in this file; presumably it carries the caller's alpha - confirm.
| .macro SAVE_F2 | |||||
| fldmias YO, { s24 - s25 } | |||||
| vmla.f32 s24, s0, s4 | |||||
| vmla.f32 s25, s0, s5 | |||||
| fstmias YO!, { s24 - s25 } | |||||
| .endm | |||||
// INIT_S2 (single): set the two column accumulators s4 and s5 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or the previous column pair's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_S2 | |||||
| mov I , #0 | |||||
| vmov s4 , I | |||||
| vmov.f32 s5 , s4 | |||||
| .endm | |||||
// KERNEL_S2X4 (single, strided x): same arithmetic as the unit-stride 2x4 kernel,
// but each x value is loaded individually and XO is advanced by INC_X bytes
// after every load. AO1 / AO2 are read contiguously with post-increment.
| .macro KERNEL_S2X4 | |||||
| fldmias XO , { s28 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| fldmias AO2!, { s16 - s17 } | |||||
| vmla.f32 s4 , s28 , s8 | |||||
| fldmias XO , { s29 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s5 , s28 , s16 | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| vmla.f32 s4 , s29 , s9 | |||||
| fldmias XO , { s30 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s5 , s29 , s17 | |||||
| fldmias AO2!, { s18 - s19 } | |||||
| vmla.f32 s4 , s30, s10 | |||||
| fldmias XO , { s31 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s5 , s30, s18 | |||||
| vmla.f32 s4 , s31, s11 | |||||
| vmla.f32 s5 , s31, s19 | |||||
| .endm | |||||
// KERNEL_S2X1 (single, strided x): remainder step - one x value (s2), XO advanced
// by INC_X bytes, one value from each column accumulated into s4 / s5.
| .macro KERNEL_S2X1 | |||||
| fldmias XO , { s2 } | |||||
| fldmias AO1!, { s8 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO2!, { s16 } | |||||
| vmla.f32 s4 , s2 , s8 | |||||
| vmla.f32 s5 , s2 , s16 | |||||
| .endm | |||||
// SAVE_S2 (single, strided y): add s0 * s4 to the value at YO, step YO by INC_Y
// bytes, add s0 * s5 to the next value, and step YO again.
| .macro SAVE_S2 | |||||
| fldmias YO, { s24 } | |||||
| vmla.f32 s24, s0, s4 | |||||
| fstmias YO, { s24 } | |||||
| add YO, YO, INC_Y | |||||
| fldmias YO, { s24 } | |||||
| vmla.f32 s24, s0, s5 | |||||
| fstmias YO, { s24 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
// INIT_F1 (single): set the single column accumulator s4 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or an earlier column's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_F1 | |||||
| mov I , #0 | |||||
| vmov s4 , I | |||||
| .endm | |||||
// KERNEL_F1X4 (single, unit stride): four x values times four values of the single
// column at AO1, accumulated into s4. XO and AO1 are post-incremented.
| .macro KERNEL_F1X4 | |||||
| fldmias XO! , { s28 - s31 } | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| vmla.f32 s4 , s28 , s8 | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| vmla.f32 s4 , s29 , s9 | |||||
| vmla.f32 s4 , s30, s10 | |||||
| vmla.f32 s4 , s31, s11 | |||||
| .endm | |||||
// KERNEL_F1X1 (single, unit stride): remainder step - s4 += x * a for one element,
// post-incrementing XO and AO1.
| .macro KERNEL_F1X1 | |||||
| fldmias XO! , { s2 } | |||||
| fldmias AO1!, { s8 } | |||||
| vmla.f32 s4 , s2 , s8 | |||||
| .endm | |||||
// SAVE_F1 (single, unit stride): add s0 * s4 to the value at YO and advance YO by
// one element.
| .macro SAVE_F1 | |||||
| fldmias YO, { s24 } | |||||
| vmla.f32 s24, s0, s4 | |||||
| fstmias YO!, { s24 } | |||||
| .endm | |||||
// INIT_S1 (single): set the single column accumulator s4 to +0.0.
// The zero is built from an integer register instead of subtracting the
// register from itself, because x - x is NaN whenever the register still
// holds NaN or +/-Inf (stale caller data or an earlier column's sum).
// Clobbers I; every expansion site reloads I on the following instruction.
| .macro INIT_S1 | |||||
| mov I , #0 | |||||
| vmov s4 , I | |||||
| .endm | |||||
// KERNEL_S1X4 (single, strided x): four x values, each loaded individually with XO
// advanced by INC_X bytes afterwards, times four contiguous values at AO1,
// accumulated into s4.
| .macro KERNEL_S1X4 | |||||
| fldmias XO , { s28 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s8 - s9 } | |||||
| vmla.f32 s4 , s28 , s8 | |||||
| fldmias XO , { s29 } | |||||
| add XO, XO, INC_X | |||||
| fldmias AO1!, { s10 - s11 } | |||||
| vmla.f32 s4 , s29 , s9 | |||||
| fldmias XO , { s30 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s4 , s30, s10 | |||||
| fldmias XO , { s31 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s4 , s31, s11 | |||||
| .endm | |||||
// KERNEL_S1X1 (single, strided x): remainder step - s4 += x * a for one element;
// XO steps by INC_X bytes, AO1 by one element.
| .macro KERNEL_S1X1 | |||||
| fldmias XO , { s2 } | |||||
| fldmias AO1!, { s8 } | |||||
| add XO, XO, INC_X | |||||
| vmla.f32 s4 , s2 , s8 | |||||
| .endm | |||||
// SAVE_S1 (single, strided y): add s0 * s4 to the value at YO and step YO by
// INC_Y bytes.
| .macro SAVE_S1 | |||||
| fldmias YO, { s24 } | |||||
| vmla.f32 s24, s0, s4 | |||||
| fstmias YO, { s24 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
// Entry point of the kernel.
// Inputs read here: M in r0, OLD_N in r1, OLD_A in r3, and OLD_LDA, X, OLD_INC_X,
// Y, OLD_INC_Y relative to fp. d0 / s0 is used as the scale factor by the SAVE_* macros.
// For each of the N columns (consecutive columns are LDA elements apart) the code
// accumulates the sum of a[i] * x[i] over M elements and adds the scaled sum to the
// next y element. Columns are handled two at a time, then one leftover column.
// Returns 0 in r0 on every path.
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} | |||||
// fp = sp as it was before the push (7 registers * 4 bytes).
| add fp, sp, #28 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
// VFP save area starts 192 bytes below fp, inside the reserved region.
| sub r12, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s31 } // store floating point registers | |||||
| #endif | |||||
// Nothing to do for an empty matrix.
| cmp M, #0 | |||||
| ble gemvt_kernel_L999 | |||||
| cmp OLD_N, #0 | |||||
| ble gemvt_kernel_L999 | |||||
// Spill A and N; r1 is reused as AO1 and r3 as scratch below.
| str OLD_A, A | |||||
| str OLD_N, N | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
// A zero increment for x or y exits without touching y.
| cmp INC_X, #0 | |||||
| beq gemvt_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq gemvt_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #3 // LDA * SIZE | |||||
| #else | |||||
| lsl LDA, LDA, #2 // LDA * SIZE | |||||
| #endif | |||||
// The F (unit-stride) path is taken only when both increments equal 1;
// anything else goes to the S (strided) path.
| cmp INC_X, #1 | |||||
| bne gemvt_kernel_S2_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne gemvt_kernel_S2_BEGIN | |||||
// ---- unit-stride path: pairs of columns ----
| gemvt_kernel_F2_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble gemvt_kernel_F1_BEGIN | |||||
| gemvt_kernel_F2X4: | |||||
// AO1 / AO2 point at the current column pair; the A slot is advanced by two columns.
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
// x is re-read from its start for every column pair.
| ldr XO , X | |||||
| INIT_F2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_F2X1 | |||||
| gemvt_kernel_F2X4_10: | |||||
| KERNEL_F2X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F2X4_10 | |||||
| gemvt_kernel_F2X1: | |||||
// I = M % 4 leftover elements; skip the remainder loop when there are none.
| ands I, M , #3 | |||||
| ble gemvt_kernel_F2_END | |||||
| gemvt_kernel_F2X1_10: | |||||
| KERNEL_F2X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F2X1_10 | |||||
| gemvt_kernel_F2_END: | |||||
| SAVE_F2 | |||||
| subs J , J , #1 | |||||
| bne gemvt_kernel_F2X4 | |||||
// ---- unit-stride path: last column when N is odd ----
| gemvt_kernel_F1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble gemvt_kernel_L999 | |||||
| gemvt_kernel_F1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_F1X1 | |||||
| gemvt_kernel_F1X4_10: | |||||
| KERNEL_F1X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F1X4_10 | |||||
| gemvt_kernel_F1X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_F1_END | |||||
| gemvt_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_F1X1_10 | |||||
| gemvt_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| b gemvt_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
// ---- strided path: increments are first converted from elements to bytes ----
| gemvt_kernel_S2_BEGIN: | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble gemvt_kernel_S1_BEGIN | |||||
| gemvt_kernel_S2X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_S2X1 | |||||
| gemvt_kernel_S2X4_10: | |||||
| KERNEL_S2X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S2X4_10 | |||||
| gemvt_kernel_S2X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_S2_END | |||||
| gemvt_kernel_S2X1_10: | |||||
| KERNEL_S2X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S2X1_10 | |||||
| gemvt_kernel_S2_END: | |||||
| SAVE_S2 | |||||
| subs J , J , #1 | |||||
| bne gemvt_kernel_S2X4 | |||||
// ---- strided path: last column when N is odd ----
| gemvt_kernel_S1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble gemvt_kernel_L999 | |||||
| gemvt_kernel_S1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble gemvt_kernel_S1X1 | |||||
| gemvt_kernel_S1X4_10: | |||||
| KERNEL_S1X4 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S1X4_10 | |||||
| gemvt_kernel_S1X1: | |||||
| ands I, M , #3 | |||||
| ble gemvt_kernel_S1_END | |||||
| gemvt_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs I, I, #1 | |||||
| bne gemvt_kernel_S1X1_10 | |||||
| gemvt_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| /*************************************************************************************************************/ | |||||
// ---- common exit: restore the VFP registers saved in the prologue, then the core registers ----
| gemvt_kernel_L999: | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s31 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
// Drop the local area: sp returns to where it was right after the push.
| sub sp, fp, #28 | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,75 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/* ABS maps to the <math.h> absolute-value routine selected by DOUBLE:
   fabs when DOUBLE is defined, fabsf otherwise. */
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
/*
 * Returns the 1-based position of the first element with the largest absolute
 * value among x[0], x[inc_x], ..., x[(n-1)*inc_x].
 * Returns 0 when n <= 0 or inc_x < 1; x is not read in that case.
 */
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix; | |||||
| BLASLONG max=0; | |||||
| FLOAT maxf; | |||||
| FLOAT v; | |||||
/* An empty vector has no maximum: report 0 without touching x[0]. */
| if (n <= 0 || inc_x < 1 ) return(0); | |||||
/* maxf is already an absolute value, so candidates are compared against it
   directly and the scan starts at the second element. */
| maxf=ABS(x[0]); | |||||
| for (i=1, ix=inc_x; i < n; i++, ix += inc_x) | |||||
| { | |||||
| v = ABS(x[ix]); | |||||
/* Strict comparison keeps the earliest position on ties. */
| if( v > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = v; | |||||
| } | |||||
| } | |||||
| return(max+1); | |||||
| } | |||||
| @@ -0,0 +1,478 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/14 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
// NOTE(review): STACKSIZE is not referenced anywhere in the visible code of this kernel.
| #define STACKSIZE 256 | |||||
// Inputs on entry: element count, vector pointer and element increment.
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
// INDEX is the 1-based position selected so far (moved to r0 on return);
// Z is the 1-based position of the element currently being examined.
| #define INDEX r3 | |||||
| #define Z r4 | |||||
| #define I r12 | |||||
// Byte offset used by the pld prefetch hints in the unit-stride loop.
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
// VABS(dst, src): absolute value in the selected precision when USE_ABS is defined;
// otherwise it expands to a nop so values are compared with their sign.
// Only the real (non-COMPLEX) macros use VABS; the complex macros call vabs directly.
| #if defined(USE_ABS) | |||||
| #if defined(DOUBLE) | |||||
| #define VABS(x0,x1) vabs.f64 x0, x1 | |||||
| #else | |||||
| #define VABS(x0,x1) vabs.f32 x0, x1 | |||||
| #endif | |||||
| #else | |||||
| #define VABS(x0,x1) nop | |||||
| #endif | |||||
| /*****************************************************************************************/ | |||||
// MOVCOND / VMOVCOND: conditional integer and floating-point moves executed after
// the compare in each KERNEL_* macro. With USE_MIN they use the lt condition
// (candidate below current best), otherwise gt (candidate above current best).
| #if defined(USE_MIN) | |||||
| #define MOVCOND movlt | |||||
| #if defined(DOUBLE) | |||||
| #define VMOVCOND vmovlt.f64 | |||||
| #else | |||||
| #define VMOVCOND vmovlt.f32 | |||||
| #endif | |||||
| #else | |||||
| #define MOVCOND movgt | |||||
| #if defined(DOUBLE) | |||||
| #define VMOVCOND vmovgt.f64 | |||||
| #else | |||||
| #define VMOVCOND vmovgt.f32 | |||||
| #endif | |||||
| #endif | |||||
| /*****************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
// INIT_F (real double, unit stride): load the first element into d0 as the initial
// best value (through VABS), advance X, and set both Z and INDEX to 1.
| .macro INIT_F | |||||
| fldmiad X!, { d0 } | |||||
| VABS( d0, d0 ) | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| .endm | |||||
// KERNEL_F1 (real double, unit stride): load the next element into d4, bump the
// position counter Z, compare d4 with the best value d0, and on the selected
// condition copy d4 into d0 and Z into INDEX.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| add Z, Z, #1 | |||||
| VABS( d4, d4 ) | |||||
| vcmpe.f64 d4, d0 | |||||
// Copy the VFP compare flags into the core flags for the conditional moves.
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| MOVCOND INDEX, Z | |||||
| .endm | |||||
// INIT_S (real double, strided): load the first element into d0 as the initial best
// value (through VABS), set Z and INDEX to 1, then step X by INC_X bytes.
| .macro INIT_S | |||||
| fldmiad X, { d0 } | |||||
| VABS( d0, d0 ) | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
// KERNEL_S1 (real double, strided): same compare-and-select step as KERNEL_F1, but
// X is advanced by INC_X bytes after the element has been processed.
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 } | |||||
| add Z, Z, #1 | |||||
| VABS( d4, d4 ) | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| MOVCOND INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
// INIT_F (real single, unit stride): load the first element into s0 as the initial
// best value (through VABS), advance X, and set both Z and INDEX to 1.
| .macro INIT_F | |||||
| fldmias X!, { s0 } | |||||
| VABS( s0, s0 ) | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| .endm | |||||
// KERNEL_F1 (real single, unit stride): load the next element into s4, bump Z,
// compare s4 with the best value s0, and on the selected condition copy s4 into s0
// and Z into INDEX.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| add Z, Z, #1 | |||||
| VABS( s4, s4 ) | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| MOVCOND INDEX, Z | |||||
| .endm | |||||
// INIT_S (real single, strided): load the first element into s0 as the initial best
// value (through VABS), set Z and INDEX to 1, then step X by INC_X bytes.
| .macro INIT_S | |||||
| fldmias X, { s0 } | |||||
| VABS( s0, s0 ) | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
// KERNEL_S1 (real single, strided): same compare-and-select step as KERNEL_F1, but
// X is advanced by INC_X bytes after the element has been processed.
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| add Z, Z, #1 | |||||
| VABS( s4, s4 ) | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| MOVCOND INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
// INIT_F (complex double, unit stride): load the first pair of values, form
// |first| + |second| in d0 as the initial best value, advance X past the pair,
// and set both Z and INDEX to 1. The absolute values are taken unconditionally.
| .macro INIT_F | |||||
| fldmiad X!, { d0 -d1 } | |||||
| vabs.f64 d0, d0 | |||||
| vabs.f64 d1, d1 | |||||
| vadd.f64 d0 , d0, d1 | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| .endm | |||||
// KERNEL_F1 (complex double, unit stride): load the next pair, form
// |first| + |second| in d4, bump Z, compare d4 with the best value d0, and on the
// selected condition copy d4 into d0 and Z into INDEX.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 - d5 } | |||||
| add Z, Z, #1 | |||||
| vabs.f64 d4, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d4 , d4, d5 | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| MOVCOND INDEX, Z | |||||
| .endm | |||||
// INIT_S (complex double, strided): load the first pair, form |first| + |second|
// in d0, set Z and INDEX to 1, then step X by INC_X bytes.
| .macro INIT_S | |||||
| fldmiad X, { d0 -d1 } | |||||
| vabs.f64 d0, d0 | |||||
| vabs.f64 d1, d1 | |||||
| vadd.f64 d0 , d0, d1 | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
// KERNEL_S1 (complex double, strided): same compare-and-select step as the
// unit-stride complex kernel, with X advanced by INC_X bytes at the end.
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 - d5 } | |||||
| add Z, Z, #1 | |||||
| vabs.f64 d4, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d4 , d4, d5 | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| MOVCOND INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
// INIT_F (complex single, unit stride): load the first pair of values, form
// |first| + |second| in s0 as the initial best value, advance X past the pair,
// and set both Z and INDEX to 1.
| .macro INIT_F | |||||
| fldmias X!, { s0 -s1 } | |||||
| vabs.f32 s0, s0 | |||||
| vabs.f32 s1, s1 | |||||
| vadd.f32 s0 , s0, s1 | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| .endm | |||||
// KERNEL_F1 (complex single, unit stride): load the next pair, form
// |first| + |second| in s4, bump Z, compare s4 with the best value s0, and on the
// selected condition copy s4 into s0 and Z into INDEX.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 - s5 } | |||||
| add Z, Z, #1 | |||||
| vabs.f32 s4, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s4 , s4, s5 | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| MOVCOND INDEX, Z | |||||
| .endm | |||||
// INIT_S (complex single, strided): load the first pair, form |first| + |second|
// in s0, set Z and INDEX to 1, then step X by INC_X bytes.
| .macro INIT_S | |||||
| fldmias X, { s0 -s1 } | |||||
| vabs.f32 s0, s0 | |||||
| vabs.f32 s1, s1 | |||||
| vadd.f32 s0 , s0, s1 | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
// KERNEL_S1 (complex single, strided): same compare-and-select step as the
// unit-stride complex kernel, with X advanced by INC_X bytes at the end.
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 - s5 } | |||||
| add Z, Z, #1 | |||||
| vabs.f32 s4, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s4 , s4, s5 | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| MOVCOND INDEX, Z | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
// Entry point: N in r0, X in r1, INC_X in r2.
// Returns INDEX in r0: 0 when N <= 0 or INC_X == 0, otherwise the 1-based position
// selected by the INIT_* / KERNEL_* macros (first element, then updated whenever a
// later element satisfies the configured gt / lt comparison).
| PROLOGUE | |||||
| .align 5 | |||||
// r4 (Z) is used as the running position counter, so it is saved here.
| push {r4} | |||||
// NOTE(review): d0 / s0 is overwritten by INIT_F / INIT_S before any compare, and the
// early exits only return INDEX, so this clear does not influence the result.
| #if defined(DOUBLE) | |||||
| vsub.f64 d0 , d0 , d0 | |||||
| #else | |||||
| vsub.f32 s0 , s0 , s0 | |||||
| #endif | |||||
| mov INDEX, #0 | |||||
| cmp N, #0 | |||||
| ble iamax_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq iamax_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne iamax_kernel_S_BEGIN | |||||
// ---- unit-stride path ----
| iamax_kernel_F_BEGIN: | |||||
| INIT_F | |||||
// The first element was consumed by INIT_F; N now counts the remaining ones.
| subs N, N , #1 | |||||
| ble iamax_kernel_L999 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble iamax_kernel_F1 | |||||
| .align 5 | |||||
// Loop body holds two groups of four KERNEL_F1 steps; I is decremented after each
// group, with an exit between the groups when I reaches zero.
| iamax_kernel_F4: | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| ble iamax_kernel_F1 | |||||
| #if defined(COMPLEX) || defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F4 | |||||
| iamax_kernel_F1: | |||||
// I = N % 4 leftover elements.
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F10 | |||||
| b iamax_kernel_L999 | |||||
// ---- strided path: INC_X is first converted from elements to bytes ----
| iamax_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| INIT_S | |||||
| subs N, N , #1 | |||||
| ble iamax_kernel_L999 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble iamax_kernel_S1 | |||||
| .align 5 | |||||
| iamax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S4 | |||||
| iamax_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
// ---- common exit ----
| iamax_kernel_L999: | |||||
| mov r0, INDEX // set return value | |||||
| pop {r4} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,75 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| BLASLONG min=0; | |||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| minf=ABS(x[0]); | |||||
| while(i < n) | |||||
| { | |||||
| if( ABS(x[ix]) < ABS(minf) ) | |||||
| { | |||||
| min = i; | |||||
| minf = ABS(x[ix]); | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(min+1); | |||||
| } | |||||
| @@ -0,0 +1,67 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT maxf=0.0; | |||||
| BLASLONG max=0; | |||||
| if (n < 0 || inc_x < 1 ) return(max); | |||||
| maxf=x[0]; | |||||
| while(i < n) | |||||
| { | |||||
| if( x[ix] > maxf ) | |||||
| { | |||||
| max = i; | |||||
| maxf = x[ix]; | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(max+1); | |||||
| } | |||||
| @@ -0,0 +1,65 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/08/19 Saar | |||||
| * BLASTEST float | |||||
| * BLASTEST double | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| BLASLONG min=0; | |||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| minf=x[0]; | |||||
| while(i < n) | |||||
| { | |||||
| if( x[ix] > minf ) | |||||
| { | |||||
| min = i; | |||||
| minf = x[ix]; | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(min+1); | |||||
| } | |||||
| @@ -0,0 +1,81 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT maxf[2]; | |||||
| BLASLONG max=0; | |||||
| BLASLONG inc_x2; | |||||
| if (n < 0 || inc_x < 1 ) return(max); | |||||
| inc_x2 = 2 * inc_x; | |||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) > CABS1(maxf,0) ) | |||||
| { | |||||
| max = i; | |||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return(max+1); | |||||
| } | |||||
| @@ -0,0 +1,81 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf[2]; | |||||
| BLASLONG min=0; | |||||
| BLASLONG inc_x2; | |||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| inc_x2 = 2 * inc_x; | |||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| while(i < n) | |||||
| { | |||||
| if( CABS1(x,ix) < CABS1(minf,0) ) | |||||
| { | |||||
| min = i; | |||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| } | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| } | |||||
| return(min+1); | |||||
| } | |||||
| @@ -0,0 +1,63 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT maxf=0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(maxf); | |||||
| maxf=x[0]; | |||||
| while(i < n) | |||||
| { | |||||
| if( x[ix] > maxf ) | |||||
| { | |||||
| maxf = x[ix]; | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(maxf); | |||||
| } | |||||
| @@ -0,0 +1,63 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : NoTest | |||||
| * BLASTEST double : NoTest | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(minf); | |||||
| minf=x[0]; | |||||
| while(i < n) | |||||
| { | |||||
| if( x[ix] < minf ) | |||||
| { | |||||
| minf = x[ix]; | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(minf); | |||||
| } | |||||
| @@ -0,0 +1,88 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/13 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT scale = 0.0; | |||||
| FLOAT ssq = 1.0; | |||||
| FLOAT absxi = 0.0; | |||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| if ( n == 1 ) return( ABS(x[0]) ); | |||||
| n *= inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| if ( x[i] != 0.0 ) | |||||
| { | |||||
| absxi = ABS( x[i] ); | |||||
| if ( scale < absxi ) | |||||
| { | |||||
| ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); | |||||
| scale = absxi ; | |||||
| } | |||||
| else | |||||
| { | |||||
| ssq += ( absxi/scale ) * ( absxi/scale ); | |||||
| } | |||||
| } | |||||
| i += inc_x; | |||||
| } | |||||
| scale = scale * sqrt( ssq ); | |||||
| return(scale); | |||||
| } | |||||
| @@ -0,0 +1,565 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/22 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
// Symbolic names for the registers and constants used by the macros below.
// STACKSIZE is not referenced in the macros that follow it here.
| #define STACKSIZE 256 | |||||
// N, X and INC_X name r0, r1 and r2 -- presumably the element count, the
// vector pointer and the stride passed by the caller (TODO confirm).
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
// I names r12; NOTE(review): looks like a scratch loop counter -- confirm
// against the loops further down the file.
| #define I r12 | |||||
// Byte offset added to X by the pld prefetch hints in the KERNEL_F8 macros.
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F1 (real, double): fold one element into the running scale/ssq pair.
// Loads one double from X into d4 and advances X (write-back form "X!").
// Per the inline comments d6 is the 0.0 comparand, d0 carries scale and d1
// carries ssq; d7 is added where the formula needs 1, so it presumably holds
// 1.0. These registers are set up outside this macro (TODO confirm).
// An element equal to zero is skipped. d2 and d3 are used as temporaries.
// The "\@" label suffix keeps the label distinct in every expansion.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// Reached only when scale < |x|: rescale ssq and adopt |x| as the new scale.
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
| .endm | |||||
// KERNEL_F8 (real, double): eight consecutive KERNEL_F1 expansions.
// A pld prefetch hint for address X + X_PRE is issued before each group of
// four elements.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (real, double): strided variant of KERNEL_F1.
// Loads one double from X into d4 without write-back, folds it into the
// running scale (d0) / ssq (d1) pair, then advances X by INC_X.
// d6 is the 0.0 comparand per the inline comments; d7 is added where the
// formula needs 1, so it presumably holds 1.0 (set up outside this macro).
// The label carries the "\@" suffix, as in the complex KERNEL_S1 macros, so
// that expanding the macro more than once cannot define a duplicate symbol.
.macro KERNEL_S1

    fldmiad     X, { d4 }
    vcmpe.f64   d4, d6                  // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         KERNEL_S1_NEXT_\@
    vabs.f64    d4, d4
    vcmpe.f64   d0, d4                  // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2 , d4, d0             // scale >= x ? x / scale
    vmlage.f64  d1 , d2 , d2            // ssq += ( x/scale ) * ( x/scale )
    bge         KERNEL_S1_NEXT_\@
    vdiv.f64    d2 , d0, d4             // scale / x
    vmul.f64    d2 , d2, d2             // ( scale / x ) * ( scale / x )
    vmul.f64    d3 , d1, d2             // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1 , d3, d7             // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0 , d4                 // scale = x

KERNEL_S1_NEXT_\@:

    add         X, X, INC_X

.endm
| #else | |||||
// KERNEL_F1 (real, single): fold one element into the running scale/ssq pair.
// Loads one float from X into s4 and advances X (write-back form "X!").
// Per the inline comments s6 is the 0.0 comparand, s0 carries scale and s1
// carries ssq; s7 is added where the formula needs 1, so it presumably holds
// 1.0. These registers are set up outside this macro (TODO confirm).
// An element equal to zero is skipped. s2 and s3 are used as temporaries.
// The "\@" label suffix keeps the label distinct in every expansion.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// Reached only when scale < |x|: rescale ssq and adopt |x| as the new scale.
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
| .endm | |||||
// KERNEL_F8 (real, single): eight consecutive KERNEL_F1 expansions preceded
// by a single pld prefetch hint for address X + X_PRE.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (real, single): strided variant of KERNEL_F1.
// Loads one float from X into s4 without write-back, folds it into the
// running scale (s0) / ssq (s1) pair, then advances X by INC_X.
// s6 is the 0.0 comparand per the inline comments; s7 is added where the
// formula needs 1, so it presumably holds 1.0 (set up outside this macro).
// The label carries the "\@" suffix, as in the complex KERNEL_S1 macros, so
// that expanding the macro more than once cannot define a duplicate symbol.
.macro KERNEL_S1

    fldmias     X, { s4 }
    vcmpe.f32   s4, s6                  // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         KERNEL_S1_NEXT_\@
    vabs.f32    s4, s4
    vcmpe.f32   s0, s4                  // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2 , s4, s0             // scale >= x ? x / scale
    vmlage.f32  s1 , s2 , s2            // ssq += ( x/scale ) * ( x/scale )
    bge         KERNEL_S1_NEXT_\@
    vdiv.f32    s2 , s0, s4             // scale / x
    vmul.f32    s2 , s2, s2             // ( scale / x ) * ( scale / x )
    vmul.f32    s3 , s1, s2             // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1 , s3, s7             // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0 , s4                 // scale = x

KERNEL_S1_NEXT_\@:

    add         X, X, INC_X

.endm
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F1 (complex, double): fold one pair of doubles into scale/ssq.
// Loads two consecutive doubles from X into d4 and d5 and advances X
// (write-back form "X!"), then applies the same update to d4 and to d5 in
// turn. Per the inline comments d6 is the 0.0 comparand, d0 carries scale and
// d1 carries ssq; d7 is added where the formula needs 1, so it presumably
// holds 1.0. These registers are set up outside this macro (TODO confirm).
// A component equal to zero is skipped. d2 and d3 are used as temporaries.
// The "\@" label suffix keeps both labels distinct in every expansion.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 - d5 } | |||||
// First component (d4).
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
// Second component (d5).
| vcmpe.f64 d5, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_END_\@ | |||||
| vabs.f64 d5, d5 | |||||
| vcmpe.f64 d0, d5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_END_\@ | |||||
| vdiv.f64 d2 , d0, d5 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d5 // scale = x | |||||
| KERNEL_F1_END_\@: | |||||
| .endm | |||||
// KERNEL_F8 (complex, double): eight consecutive KERNEL_F1 expansions.
// A pld prefetch hint for address X + X_PRE is issued before each group of
// two expansions.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (complex, double): strided variant of the complex KERNEL_F1.
// Loads two consecutive doubles from X into d4 and d5 without write-back,
// applies the scale/ssq update to each in turn, then advances X by INC_X.
// Per the inline comments d6 is the 0.0 comparand, d0 carries scale and d1
// carries ssq; d7 is added where the formula needs 1, so it presumably holds
// 1.0. These registers are set up outside this macro (TODO confirm).
// NOTE(review): INC_X is added to the address as-is, so it is presumably
// already a byte stride by the time this macro runs -- confirm at the caller.
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 - d5 } | |||||
// First component (d4).
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
// Second component (d5).
| vcmpe.f64 d5, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_END_\@ | |||||
| vabs.f64 d5, d5 | |||||
| vcmpe.f64 d0, d5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_END_\@ | |||||
| vdiv.f64 d2 , d0, d5 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d5 // scale = x | |||||
| KERNEL_S1_END_\@: | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
// KERNEL_F1 (complex, single): fold one pair of floats into scale/ssq.
// Loads two consecutive floats from X into s4 and s5 and advances X
// (write-back form "X!"), then applies the same update to s4 and to s5 in
// turn. Per the inline comments s6 is the 0.0 comparand, s0 carries scale and
// s1 carries ssq; s7 is added where the formula needs 1, so it presumably
// holds 1.0. These registers are set up outside this macro (TODO confirm).
// A component equal to zero is skipped. s2 and s3 are used as temporaries.
// The "\@" label suffix keeps both labels distinct in every expansion.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 - s5 } | |||||
// First component (s4).
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
// Second component (s5).
| vcmpe.f32 s5, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_END_\@ | |||||
| vabs.f32 s5, s5 | |||||
| vcmpe.f32 s0, s5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_END_\@ | |||||
| vdiv.f32 s2 , s0, s5 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s5 // scale = x | |||||
| KERNEL_F1_END_\@: | |||||
| .endm | |||||
// KERNEL_F8 (complex, single precision): eight back-to-back KERNEL_F1
// expansions (eight complex elements), with a prefetch of X + X_PRE issued
// before each group of four.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (complex, single precision, strided): loads one complex element
// (two floats into s4 and s5) from X without writeback and folds each
// component into the running pair scale (s0) / ssq (s1), then steps X by INC_X.
// s6 is compared against as 0.0 and s7 is added as 1.0; a component equal to
// 0.0 is skipped.
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 - s5 } | |||||
// ---- first component (s4) ----
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
// ---- second component (s5), same update ----
| vcmpe.f32 s5, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_END_\@ | |||||
| vabs.f32 s5, s5 | |||||
| vcmpe.f32 s0, s5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_END_\@ | |||||
| vdiv.f32 s2 , s0, s5 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s5 // scale = x | |||||
| KERNEL_S1_END_\@: | |||||
// advance to the next element; INC_X was converted to a byte stride by the caller
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
// nrm2 entry routine. Inputs: N (element count), X (vector pointer) and INC_X
// (element stride). The routine keeps a running pair scale (d0/s0) and ssq
// (d1/s1), updated by the KERNEL_* macros, and finishes with
// scale * sqrt(ssq) in d0/s0 before returning through lr.
// N <= 0 or INC_X == 0 skips all loads, so the result is 0.0 * sqrt(1.0).
| PROLOGUE | |||||
| b nrm2_begin | |||||
// In-line constant 1.0 (IEEE-754 bit pattern), loaded PC-relative below;
// the branch above jumps over it.
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| znrm2_one: | |||||
| .word 0x00000000 | |||||
| .word 0x3ff00000 | |||||
| #else | |||||
| cnrm2_one: | |||||
| .word 0x3f800000 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| dnrm2_one: | |||||
| .word 0x00000000 | |||||
| .word 0x3ff00000 | |||||
| #else | |||||
| snrm2_one: | |||||
| .word 0x3f800000 | |||||
| #endif | |||||
| #endif | |||||
| .align 5 | |||||
| nrm2_begin: | |||||
// Build +0.0 from an integer zero instead of "vsub x, x, x": the incoming
// d0/s0 holds whatever the caller left there, and NaN - NaN or Inf - Inf is
// NaN, which would poison scale and the final result. I (the scratch loop
// counter) is free here; it is re-initialised before either loop uses it.
| mov I, #0 | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| vmov d0 , I , I // scale=0.0 | |||||
| vldr.64 d1 , znrm2_one // ssq=1.0 | |||||
| vmov.f64 d7 , d1 // value 1.0 | |||||
| vmov.f64 d6 , d0 // value 0.0 | |||||
| #else | |||||
| vmov s0 , I // scale=0.0 | |||||
| vldr.32 s1 , cnrm2_one // ssq=1.0 | |||||
| vmov.f32 s7 , s1 // value 1.0 | |||||
| vmov.f32 s6 , s0 // value 0.0 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| vmov d0 , I , I // scale=0.0 | |||||
| vldr.64 d1 , dnrm2_one // ssq=1.0 | |||||
| vmov.f64 d7 , d1 // value 1.0 | |||||
| vmov.f64 d6 , d0 // value 0.0 | |||||
| #else | |||||
| vmov s0 , I // scale=0.0 | |||||
| vldr.32 s1 , snrm2_one // ssq=1.0 | |||||
| vmov.f32 s7 , s1 // value 1.0 | |||||
| vmov.f32 s6 , s0 // value 0.0 | |||||
| #endif | |||||
| #endif | |||||
// Nothing to accumulate for an empty vector or a zero stride.
| cmp N, #0 | |||||
| ble nrm2_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq nrm2_kernel_L999 | |||||
// Unit stride takes the contiguous (F) path; anything else the strided (S) path.
| cmp INC_X, #1 | |||||
| bne nrm2_kernel_S_BEGIN | |||||
| nrm2_kernel_F_BEGIN: | |||||
| asrs I, N, #3 // I = N / 8 | |||||
| ble nrm2_kernel_F1 | |||||
| nrm2_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F8 | |||||
| nrm2_kernel_F1: | |||||
// remainder: the low three bits of N, i.e. N mod 8 elements
| ands I, N, #7 | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F10 | |||||
| b nrm2_kernel_L999 | |||||
| nrm2_kernel_S_BEGIN: | |||||
// Convert INC_X from elements to bytes for the "add X, X, INC_X" in KERNEL_S1.
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| nrm2_kernel_S1: | |||||
| mov I, N | |||||
| .align 5 | |||||
| nrm2_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S10 | |||||
| nrm2_kernel_L999: | |||||
// result = scale * sqrt(ssq); only the DOUBLE switch matters here because the
// complex variants accumulate into the same two registers.
| #if defined(DOUBLE) | |||||
| vsqrt.f64 d1, d1 | |||||
| vmul.f64 d0, d0, d1 | |||||
| #else | |||||
| vsqrt.f32 s1, s1 | |||||
| vmul.f32 s0, s0, s1 | |||||
| #endif | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,508 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/16 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define I r12 | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F1 (real, double precision, contiguous): loads one double from X into
// d4 with post-increment writeback and folds it into the running pair
// scale (d0) / ssq (d1). d6 is compared against as 0.0 and d7 is added as 1.0;
// an element equal to 0.0 is skipped.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 } | |||||
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
| .endm | |||||
// KERNEL_F8 (real, double precision): eight back-to-back KERNEL_F1 expansions
// (eight elements), with a prefetch of X + X_PRE issued before each group of
// four.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (real, double precision, strided): loads one double from X into d4
// without writeback, folds it into the running pair scale (d0) / ssq (d1),
// then steps X by INC_X (a byte stride prepared by the caller).
// d6 is compared against as 0.0 and d7 is added as 1.0; an element equal to
// 0.0 is skipped.
// The local label carries the \@ expansion counter, like the other macros in
// this file, so expanding the macro more than once cannot define a duplicate
// symbol.
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 } | |||||
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
// KERNEL_F1 (real, single precision, contiguous): loads one float from X into
// s4 with post-increment writeback and folds it into the running pair
// scale (s0) / ssq (s1). s6 is compared against as 0.0 and s7 is added as 1.0;
// an element equal to 0.0 is skipped.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
| .endm | |||||
// KERNEL_F8 (real, single precision): one prefetch of X + X_PRE followed by
// eight back-to-back KERNEL_F1 expansions (eight floats).
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (real, single precision, strided): loads one float from X into s4
// without writeback, folds it into the running pair scale (s0) / ssq (s1),
// then steps X by INC_X (a byte stride prepared by the caller).
// s6 is compared against as 0.0 and s7 is added as 1.0; an element equal to
// 0.0 is skipped.
// The local label carries the \@ expansion counter, like the other macros in
// this file, so expanding the macro more than once cannot define a duplicate
// symbol.
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F1 (complex, double precision, contiguous): loads one complex element
// (two doubles into d4 and d5) from X with post-increment writeback and folds
// each component into the running pair scale (d0) / ssq (d1).
// d6 is compared against as 0.0 and d7 is added as 1.0; a component equal to
// 0.0 is skipped.
| .macro KERNEL_F1 | |||||
| fldmiad X!, { d4 - d5 } | |||||
// ---- first component (d4) ----
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
// ---- second component (d5), same update ----
| vcmpe.f64 d5, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_END_\@ | |||||
| vabs.f64 d5, d5 | |||||
| vcmpe.f64 d0, d5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_END_\@ | |||||
| vdiv.f64 d2 , d0, d5 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d5 // scale = x | |||||
| KERNEL_F1_END_\@: | |||||
| .endm | |||||
// KERNEL_F8 (complex, double precision): eight KERNEL_F1 expansions (eight
// complex elements), with a prefetch of X + X_PRE issued before each pair.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (complex, double precision, strided): loads one complex element
// (two doubles into d4 and d5) from X without writeback and folds each
// component into the running pair scale (d0) / ssq (d1), then steps X by INC_X.
// d6 is compared against as 0.0 and d7 is added as 1.0; a component equal to
// 0.0 is skipped.
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 - d5 } | |||||
// ---- first component (d4) ----
| vcmpe.f64 d4, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f64 d4, d4 | |||||
| vcmpe.f64 d0, d4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f64 d2 , d0, d4 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
// ---- second component (d5), same update ----
| vcmpe.f64 d5, d6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_END_\@ | |||||
| vabs.f64 d5, d5 | |||||
| vcmpe.f64 d0, d5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale | |||||
| vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_END_\@ | |||||
| vdiv.f64 d2 , d0, d5 // scale / x | |||||
| vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f64 d0 , d5 // scale = x | |||||
| KERNEL_S1_END_\@: | |||||
// advance to the next element; INC_X was converted to a byte stride by the caller
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
// KERNEL_F1 (complex, single precision, contiguous): loads one complex element
// (two floats into s4 and s5) from X with post-increment writeback and folds
// each component into the running pair scale (s0) / ssq (s1).
// s6 is compared against as 0.0 and s7 is added as 1.0; a component equal to
// 0.0 is skipped.
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 - s5 } | |||||
// ---- first component (s4) ----
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_F1_NEXT_\@: | |||||
// ---- second component (s5), same update ----
| vcmpe.f32 s5, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_F1_END_\@ | |||||
| vabs.f32 s5, s5 | |||||
| vcmpe.f32 s0, s5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_F1_END_\@ | |||||
| vdiv.f32 s2 , s0, s5 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s5 // scale = x | |||||
| KERNEL_F1_END_\@: | |||||
| .endm | |||||
// KERNEL_F8 (complex, single precision): eight back-to-back KERNEL_F1
// expansions (eight complex elements), with a prefetch of X + X_PRE issued
// before each group of four.
| .macro KERNEL_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
// KERNEL_S1 (complex, single precision, strided): loads one complex element
// (two floats into s4 and s5) from X without writeback and folds each
// component into the running pair scale (s0) / ssq (s1), then steps X by INC_X.
// s6 is compared against as 0.0 and s7 is added as 1.0; a component equal to
// 0.0 is skipped.
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 - s5 } | |||||
// ---- first component (s4) ----
| vcmpe.f32 s4, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_NEXT_\@ | |||||
| vabs.f32 s4, s4 | |||||
| vcmpe.f32 s0, s4 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_NEXT_\@ | |||||
// scale < |x|: rescale ssq to the new, larger scale
| vdiv.f32 s2 , s0, s4 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s4 // scale = x | |||||
| KERNEL_S1_NEXT_\@: | |||||
// ---- second component (s5), same update ----
| vcmpe.f32 s5, s6 // compare with 0.0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| beq KERNEL_S1_END_\@ | |||||
| vabs.f32 s5, s5 | |||||
| vcmpe.f32 s0, s5 // compare with scale | |||||
| vmrs APSR_nzcv, fpscr | |||||
| vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale | |||||
| vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) | |||||
| bge KERNEL_S1_END_\@ | |||||
| vdiv.f32 s2 , s0, s5 // scale / x | |||||
| vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) | |||||
| vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) | |||||
| vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) | |||||
| vmov.f32 s0 , s5 // scale = x | |||||
| KERNEL_S1_END_\@: | |||||
// advance to the next element; INC_X was converted to a byte stride by the caller
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
// nrm2 entry routine. Inputs: N (element count), X (vector pointer) and INC_X
// (element stride). The routine keeps a running pair scale (d0/s0) and ssq
// (d1/s1), updated by the KERNEL_* macros, and finishes with
// scale * sqrt(ssq) in d0/s0 before returning through lr.
// N <= 0 or INC_X == 0 skips all loads, so the result is 0.0 * sqrt(1.0).
| PROLOGUE | |||||
| .align 5 | |||||
// Build +0.0 from an integer zero instead of "vsub x, x, x": the incoming
// d0/s0 holds whatever the caller left there, and NaN - NaN or Inf - Inf is
// NaN, which would poison scale and the final result. I (the scratch loop
// counter) is free here; it is re-initialised before either loop uses it.
| mov I, #0 | |||||
| #if defined(DOUBLE) | |||||
| vmov d0 , I , I // scale=0.0 | |||||
| vmov.f64 d1 , #1.0 // ssq=1.0 | |||||
| vmov.f64 d7 , d1 // value 1.0 | |||||
| vmov.f64 d6 , d0 // value 0.0 | |||||
| #else | |||||
| vmov s0 , I // scale=0.0 | |||||
| vmov.f32 s1 , #1.0 // ssq=1.0 | |||||
| vmov.f32 s7 , s1 // value 1.0 | |||||
| vmov.f32 s6 , s0 // value 0.0 | |||||
| #endif | |||||
// Nothing to accumulate for an empty vector or a zero stride.
| cmp N, #0 | |||||
| ble nrm2_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq nrm2_kernel_L999 | |||||
// Unit stride takes the contiguous (F) path; anything else the strided (S) path.
| cmp INC_X, #1 | |||||
| bne nrm2_kernel_S_BEGIN | |||||
| nrm2_kernel_F_BEGIN: | |||||
| asrs I, N, #3 // I = N / 8 | |||||
| ble nrm2_kernel_F1 | |||||
| nrm2_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F8 | |||||
| nrm2_kernel_F1: | |||||
// remainder: the low three bits of N, i.e. N mod 8 elements
| ands I, N, #7 | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F10 | |||||
| b nrm2_kernel_L999 | |||||
| nrm2_kernel_S_BEGIN: | |||||
// Convert INC_X from elements to bytes for the "add X, X, INC_X" in KERNEL_S1.
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| nrm2_kernel_S1: | |||||
| mov I, N | |||||
| .align 5 | |||||
| nrm2_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S10 | |||||
| nrm2_kernel_L999: | |||||
// result = scale * sqrt(ssq)
| #if defined(DOUBLE) | |||||
| vsqrt.f64 d1, d1 | |||||
| vmul.f64 d0, d0, d1 | |||||
| #else | |||||
| vsqrt.f32 s1, s1 | |||||
| vmul.f32 s0, s0, s1 | |||||
| #endif | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,62 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
/*
 * Reference plane-rotation kernel. For each of the n index pairs it replaces
 *     x[ix] <- c*x[ix] + s*y[iy]
 *     y[iy] <- c*y[iy] - s*x[ix]   (using the old x[ix])
 * stepping ix by inc_x and iy by inc_y from 0. Does nothing when n <= 0.
 * Always returns 0.
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG k; | |||||
| BLASLONG px = 0; | |||||
| BLASLONG py = 0; | |||||
| for (k = 0; k < n; k++) | |||||
| { | |||||
| FLOAT xv = x[px]; | |||||
| FLOAT yv = y[py]; | |||||
| FLOAT rotated_x = c*xv + s*yv; | |||||
| y[py] = c*yv - s*xv; | |||||
| x[px] = rotated_x; | |||||
| px += inc_x; | |||||
| py += inc_y; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,584 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/15 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_INC_Y [fp, #0 ] | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define Y r3 | |||||
| #define INC_Y r4 | |||||
| #define I r12 | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| /*****************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F4 (real, double precision, contiguous): after prefetching X + X_PRE
// and Y + X_PRE, performs four identical steps. Each step loads x into d4 and
// y into d5, computes d2 = d0*x + d1*y and d3 = d0*y - d1*x (fnmacd subtracts
// the product from its accumulator), then stores d2 to X and d3 to Y with
// post-increment. d0/d1 are presumably the rotation's c and s -- the formula
// matches the C reference kernel; confirm against the caller.
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
// step 1
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
// step 2
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
// step 3
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
// step 4
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| .endm | |||||
// KERNEL_F1 (real, double precision, contiguous): one rotation step.
// Loads x into d4 and y into d5, computes d2 = d0*x + d1*y and
// d3 = d0*y - d1*x, then stores d2 to X and d3 to Y with post-increment.
| .macro KERNEL_F1 | |||||
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| .endm | |||||
// KERNEL_S1 (real, double precision, strided): one rotation step.
// Loads x into d4 and y into d5, computes d2 = d0*x + d1*y and
// d3 = d0*y - d1*x, stores them back in place (no writeback), then advances
// X by INC_X and Y by INC_Y (presumably byte strides set up by the caller,
// which is outside this view).
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 } | |||||
| fldmiad Y, { d5 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d5 | |||||
| vmul.f64 d3 , d0, d5 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X, { d2 } | |||||
| fstmiad Y, { d3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
// KERNEL_F4 (real, single precision, contiguous): four identical rotation
// steps. Each step loads x into s4 and y into s5, computes
// s2 = s0*x + s1*y and s3 = s0*y - s1*x (fnmacs subtracts the product from
// its accumulator), then stores s2 to X and s3 to Y with post-increment.
// Unlike the double-precision variant, no prefetch is issued here.
| .macro KERNEL_F4 | |||||
// step 1
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
// step 2
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
// step 3
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
// step 4
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
| .endm | |||||
// KERNEL_F1 (real, single precision, contiguous): one rotation step.
// Loads x into s4 and y into s5, computes s2 = s0*x + s1*y and
// s3 = s0*y - s1*x, then stores s2 to X and s3 to Y with post-increment.
| .macro KERNEL_F1 | |||||
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
| .endm | |||||
// KERNEL_S1 (real, single precision, strided): one rotation step.
// Loads x into s4 and y into s5, computes s2 = s0*x + s1*y and
// s3 = s0*y - s1*x, stores them back in place (no writeback), then advances
// X by INC_X and Y by INC_Y (presumably byte strides set up by the caller,
// which is outside this view).
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| fldmias Y, { s5 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s5 | |||||
| vmul.f32 s3 , s0, s5 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X, { s2 } | |||||
| fstmias Y, { s3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
// KERNEL_F4 (complex, double precision, contiguous): rotates four complex
// elements. For each element the two components of x are loaded into d4-d5
// and those of y into d6-d7; each component pair (d4,d6) and (d5,d7) is
// rotated independently as d2 = d0*x + d1*y, d3 = d0*y - d1*x and stored to
// X / Y with post-increment, one double at a time.
// X + X_PRE and Y + X_PRE are prefetched before the first and third elements.
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
// element 1
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
// element 2
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
// element 3
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
// element 4
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| .endm | |||||
| // KERNEL_F1: rotate one element (two consecutive doubles, presumably the | |||||
| // real and imaginary part) at unit stride. | |||||
| // For each of the two components: | |||||
| //   new X = d0 * x + d1 * y | |||||
| //   new Y = d0 * y - d1 * x | |||||
| // d0 / d1 are read only (presumably the rotation coefficients c and s, | |||||
| // TODO confirm against the caller). X and Y are post-incremented by the | |||||
| // stores, one double at a time. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| // first component | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| // second component | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| fstmiad X!, { d2 } | |||||
| fstmiad Y!, { d3 } | |||||
| .endm | |||||
| // KERNEL_S1: same arithmetic as KERNEL_F1 for the strided case. Results | |||||
| // are written with fixed offsets 0 and 8 so the pointers stay put, then | |||||
| // X and Y advance by INC_X / INC_Y (used here as byte offsets). | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 - d5 } | |||||
| fldmiad Y, { d6 - d7 } | |||||
| vmul.f64 d2 , d0, d4 | |||||
| fmacd d2 , d1, d6 | |||||
| vmul.f64 d3 , d0, d6 | |||||
| fnmacd d3 , d1, d4 | |||||
| vstr d2 , [ X, #0 ] | |||||
| vstr d3 , [ Y, #0 ] | |||||
| vmul.f64 d2 , d0, d5 | |||||
| fmacd d2 , d1, d7 | |||||
| vmul.f64 d3 , d0, d7 | |||||
| fnmacd d3 , d1, d5 | |||||
| vstr d2 , [ X, #8 ] | |||||
| vstr d3 , [ Y, #8 ] | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| // KERNEL_F4 (two singles per element): rotate four unit-stride elements | |||||
| // of X and Y. Expands to the same instruction stream as the fully unrolled | |||||
| // form: one prefetch pair in front of every two KERNEL_F1 bodies. | |||||
| // KERNEL_F1 is defined further down; the assembler only looks it up when | |||||
| // KERNEL_F4 itself is expanded. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
| // KERNEL_F1: rotate one element (two consecutive singles, presumably the | |||||
| // real and imaginary part) at unit stride. | |||||
| // For each of the two components: | |||||
| //   new X = s0 * x + s1 * y | |||||
| //   new Y = s0 * y - s1 * x | |||||
| // s0 / s1 are read only (presumably the rotation coefficients c and s, | |||||
| // TODO confirm against the caller). X and Y are post-incremented by the | |||||
| // stores, one single at a time. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X, { s4 - s5 } | |||||
| fldmias Y, { s6 - s7 } | |||||
| // first component | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s6 | |||||
| vmul.f32 s3 , s0, s6 | |||||
| fnmacs s3 , s1, s4 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
| // second component | |||||
| vmul.f32 s2 , s0, s5 | |||||
| fmacs s2 , s1, s7 | |||||
| vmul.f32 s3 , s0, s7 | |||||
| fnmacs s3 , s1, s5 | |||||
| fstmias X!, { s2 } | |||||
| fstmias Y!, { s3 } | |||||
| .endm | |||||
| // KERNEL_S1: same arithmetic as KERNEL_F1 for the strided case. Results | |||||
| // are written with fixed offsets 0 and 4 so the pointers stay put, then | |||||
| // X and Y advance by INC_X / INC_Y (used here as byte offsets). | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 - s5 } | |||||
| fldmias Y, { s6 - s7 } | |||||
| vmul.f32 s2 , s0, s4 | |||||
| fmacs s2 , s1, s6 | |||||
| vmul.f32 s3 , s0, s6 | |||||
| fnmacs s3 , s1, s4 | |||||
| vstr s2 , [ X, #0 ] | |||||
| vstr s3 , [ Y, #0 ] | |||||
| vmul.f32 s2 , s0, s5 | |||||
| fmacs s2 , s1, s7 | |||||
| vmul.f32 s3 , s0, s7 | |||||
| fnmacs s3 , s1, s5 | |||||
| vstr s2 , [ X, #4 ] | |||||
| vstr s3 , [ Y, #4 ] | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Kernel entry. Dispatches to a unit-stride path (KERNEL_F4 / KERNEL_F1) | |||||
| // or a strided path (KERNEL_S1) and always returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Save r4 and fp (8 bytes) and point fp just above them. | |||||
| push {r4 , fp} | |||||
| add fp, sp, #8 | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| // Nothing to do for N <= 0 or when either increment is zero. | |||||
| cmp N, #0 | |||||
| ble rot_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq rot_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq rot_kernel_L999 | |||||
| // The unit-stride path requires both increments to be exactly 1. | |||||
| cmp INC_X, #1 | |||||
| bne rot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne rot_kernel_S_BEGIN | |||||
| rot_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble rot_kernel_F1 | |||||
| .align 5 | |||||
| // Loop body is unrolled twice: each pass runs KERNEL_F4 up to two times | |||||
| // and leaves early when the block count reaches zero in between. | |||||
| rot_kernel_F4: | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble rot_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F4 | |||||
| // Remainder of the unit-stride path: N & 3 single elements. | |||||
| // NOTE(review): ands leaves the V flag untouched, so ble here also tests | |||||
| // a stale V; beq would state the "no remainder" check directly. | |||||
| rot_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble rot_kernel_L999 | |||||
| rot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F10 | |||||
| b rot_kernel_L999 | |||||
| // Strided path: first turn the element increments into byte increments. | |||||
| // The shift is log2 of the element size selected by COMPLEX / DOUBLE. | |||||
| rot_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble rot_kernel_S1 | |||||
| .align 5 | |||||
| // Four strided elements per pass. | |||||
| rot_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S4 | |||||
| // Remainder of the strided path: N & 3 single elements. | |||||
| rot_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble rot_kernel_L999 | |||||
| rot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S10 | |||||
| // Common exit: return 0, rewind sp to the two saved registers, restore. | |||||
| rot_kernel_L999: | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #8 | |||||
| pop {r4,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,58 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Scale a strided vector in place: x[k * inc_x] = da * x[k * inc_x] for | |||||
|  * k = 0 .. n-1. Returns 0 in every case. Nothing is written when n is | |||||
|  * negative, when inc_x is smaller than 1, or when da equals 1.0. | |||||
|  * The dummy*, y and inc_y arguments are not used by this kernel. | |||||
|  */ | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG pos; | |||||
| BLASLONG limit; | |||||
| if ( n < 0 || inc_x < 1 || da == 1.0 ) | |||||
| { | |||||
| return(0); | |||||
| } | |||||
| /* One past the last index that may be touched. */ | |||||
| limit = n * inc_x; | |||||
| for ( pos = 0; pos < limit; pos += inc_x ) | |||||
| { | |||||
| x[pos] = da * x[pos] ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,376 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/15 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Register and stack-slot names used by the scal kernel below. | |||||
| // STACKSIZE is not referenced by the kernel code visible in this file. | |||||
| #define STACKSIZE 256 | |||||
| // Stack slot the kernel loads INC_X from on entry. | |||||
| #define OLD_INC_X [sp, #0 ] | |||||
| // Element count. | |||||
| #define N r0 | |||||
| // Increment of X; converted to a byte stride on the strided path. | |||||
| #define INC_X r1 | |||||
| // Pointer to the current element of X. | |||||
| #define X r3 | |||||
| // Loop counter. | |||||
| #define I r12 | |||||
| // Byte offset ahead of X used by the pld prefetch hints. | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| /*****************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| // Real, double precision. d0 is the scale factor and is read only. | |||||
| // KERNEL_F4: prefetch, load four doubles from X, multiply each by d0 and | |||||
| // store them back in two pairs; X ends up advanced by four doubles. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmiad X, { d4 - d7 } | |||||
| vmul.f64 d4, d4, d0 | |||||
| vmul.f64 d5, d5, d0 | |||||
| vmul.f64 d6, d6, d0 | |||||
| fstmiad X!, { d4 - d5 } | |||||
| vmul.f64 d7, d7, d0 | |||||
| fstmiad X!, { d6 - d7 } | |||||
| .endm | |||||
| // KERNEL_F1: scale one double at unit stride; the store advances X. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X, { d4 } | |||||
| vmul.f64 d4, d4, d0 | |||||
| fstmiad X!, { d4 } | |||||
| .endm | |||||
| // KERNEL_S1: scale one double in place, then step X by INC_X | |||||
| // (used here as a byte offset). | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 } | |||||
| vmul.f64 d4, d4, d0 | |||||
| fstmiad X, { d4 } | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| // Real, single precision. s0 is the scale factor and is read only. | |||||
| // KERNEL_F4: load four singles from X, multiply each by s0 and store them | |||||
| // back in two pairs; X ends up advanced by four singles. This variant has | |||||
| // no pld of its own; the caller issues the prefetch for this case. | |||||
| .macro KERNEL_F4 | |||||
| fldmias X, { s4 - s7 } | |||||
| vmul.f32 s4, s4, s0 | |||||
| vmul.f32 s5, s5, s0 | |||||
| vmul.f32 s6, s6, s0 | |||||
| fstmias X!, { s4 - s5 } | |||||
| vmul.f32 s7, s7, s0 | |||||
| fstmias X!, { s6 - s7 } | |||||
| .endm | |||||
| // KERNEL_F1: scale one single at unit stride; the store advances X. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X, { s4 } | |||||
| vmul.f32 s4, s4, s0 | |||||
| fstmias X!, { s4 } | |||||
| .endm | |||||
| // KERNEL_S1: scale one single in place, then step X by INC_X | |||||
| // (used here as a byte offset). | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| vmul.f32 s4, s4, s0 | |||||
| fstmias X, { s4 } | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| // KERNEL_F4 (complex, double): scale four unit-stride complex elements. | |||||
| // Expands to the same instruction stream as the fully unrolled form: one | |||||
| // prefetch in front of every two KERNEL_F1 bodies. KERNEL_F1 is defined | |||||
| // further down; the assembler only looks it up when KERNEL_F4 is expanded. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
| // KERNEL_F1 (complex, double): multiply one complex element of X by the | |||||
| // complex factor held in d0 (real part) and d1 (imaginary part): | |||||
| //   new re = d0 * re - d1 * im | |||||
| //   new im = d0 * im + d1 * re | |||||
| // The store writes both parts and advances X by one complex element. | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X, { d4 - d5 } | |||||
| vmul.f64 d2, d0, d4 | |||||
| fnmacd d2, d1, d5 | |||||
| vmul.f64 d3, d0, d5 | |||||
| fmacd d3, d1, d4 | |||||
| fstmiad X!, { d2 - d3 } | |||||
| .endm | |||||
| // KERNEL_S1: same arithmetic for the strided case; the store leaves X | |||||
| // unchanged and X is then stepped by INC_X (used here as a byte offset). | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d4 - d5 } | |||||
| vmul.f64 d2, d0, d4 | |||||
| fnmacd d2, d1, d5 | |||||
| vmul.f64 d3, d0, d5 | |||||
| fmacd d3, d1, d4 | |||||
| fstmiad X, { d2 - d3 } | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| // KERNEL_F4 (complex, single): scale four unit-stride complex elements. | |||||
| // Expands to the same instruction stream as the fully unrolled form: a | |||||
| // single prefetch followed by four KERNEL_F1 bodies. KERNEL_F1 is defined | |||||
| // further down; the assembler only looks it up when KERNEL_F4 is expanded. | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
| // KERNEL_F1 (complex, single): multiply one complex element of X by the | |||||
| // complex factor held in s0 (real part) and s1 (imaginary part): | |||||
| //   new re = s0 * re - s1 * im | |||||
| //   new im = s0 * im + s1 * re | |||||
| // The store writes both parts and advances X by one complex element. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X, { s4 - s5 } | |||||
| vmul.f32 s2, s0, s4 | |||||
| fnmacs s2, s1, s5 | |||||
| vmul.f32 s3, s0, s5 | |||||
| fmacs s3, s1, s4 | |||||
| fstmias X!, { s2 - s3 } | |||||
| .endm | |||||
| // KERNEL_S1: same arithmetic for the strided case; the store leaves X | |||||
| // unchanged and X is then stepped by INC_X (used here as a byte offset). | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 - s5 } | |||||
| vmul.f32 s2, s0, s4 | |||||
| fnmacs s2, s1, s5 | |||||
| vmul.f32 s3, s0, s5 | |||||
| fmacs s3, s1, s4 | |||||
| fstmias X, { s2 - s3 } | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Kernel entry. Scales X in place through a unit-stride path | |||||
| // (KERNEL_F4 / KERNEL_F1) or a strided path (KERNEL_S1) and always | |||||
| // returns 0 in r0. No registers are pushed and sp is never moved. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| ldr INC_X , OLD_INC_X | |||||
| // Nothing to do for N <= 0 or INC_X <= 0. | |||||
| cmp N, #0 | |||||
| ble scal_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| ble scal_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne scal_kernel_S_BEGIN | |||||
| scal_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble scal_kernel_F1 | |||||
| .align 5 | |||||
| // Loop body is unrolled twice: each pass runs KERNEL_F4 up to two times | |||||
| // and leaves early when the block count reaches zero in between. | |||||
| scal_kernel_F4: | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||||
| // The real single-precision KERNEL_F4 has no prefetch of its own. | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble scal_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_F4 | |||||
| // Remainder of the unit-stride path: N & 3 single elements. | |||||
| scal_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble scal_kernel_L999 | |||||
| scal_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_F10 | |||||
| b scal_kernel_L999 | |||||
| // Strided path: turn the element increment into a byte increment. | |||||
| // The shift is log2 of the element size selected by COMPLEX / DOUBLE. | |||||
| scal_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble scal_kernel_S1 | |||||
| .align 5 | |||||
| // Four strided elements per pass. | |||||
| scal_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_S4 | |||||
| // Remainder of the strided path: N & 3 single elements. | |||||
| scal_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble scal_kernel_L999 | |||||
| scal_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_S10 | |||||
| // Common exit. | |||||
| scal_kernel_L999: | |||||
| mov r0, #0 // set return value | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,224 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of scratch stack reserved by the prologue. | |||||
| #define STACKSIZE 256 | |||||
| // Element count. | |||||
| #define N r0 | |||||
| // Source pointer. | |||||
| #define X r1 | |||||
| // Source increment; converted to a byte stride on the strided path. | |||||
| #define INC_X r2 | |||||
| // Destination pointer as received; copied into Y by the prologue. | |||||
| #define OLD_Y r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| * | |||||
| * NOTE(review): the kernel stores s8 - s15 (32 bytes) | |||||
| * starting at fp - 128, so only part of this range is | |||||
| * actually written. | |||||
| *******************************************************/ | |||||
| // Slot INC_Y is loaded from once fp has been set up by the prologue. | |||||
| #define OLD_INC_Y [fp, #4 ] | |||||
| // Loop counter. | |||||
| #define I r5 | |||||
| // Destination pointer. | |||||
| #define Y r6 | |||||
| // Destination increment; converted to a byte stride on the strided path. | |||||
| #define INC_Y r7 | |||||
| // Byte offset ahead of X used by the pld prefetch hint. | |||||
| #define X_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // COPY_F8: copy eight consecutive singles from X to Y through s0 - s7. | |||||
| // Both pointers are post-incremented by the load / store instructions. | |||||
| .macro COPY_F8 | |||||
| pld [ X, #X_PRE ] | |||||
| fldmias X!, { s0 - s3 } | |||||
| fldmias X!, { s4 - s7 } | |||||
| fstmias Y!, { s0 - s3 } | |||||
| fstmias Y!, { s4 - s7 } | |||||
| .endm | |||||
| // COPY_F1: copy one single from X to Y and advance both pointers. | |||||
| .macro COPY_F1 | |||||
| fldmias X!, { s0 } | |||||
| fstmias Y!, { s0 } | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| // COPY_S4: copy four strided singles from X to Y, alternating between s0 | |||||
| // and s1 as the transfer register. After each element X and Y are stepped | |||||
| // by INC_X / INC_Y (used here as byte offsets). | |||||
| .macro COPY_S4 | |||||
| nop | |||||
| fldmias X, { s0 } | |||||
| fstmias Y, { s0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s1 } | |||||
| fstmias Y, { s1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s0 } | |||||
| fstmias Y, { s0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmias X, { s1 } | |||||
| fstmias Y, { s1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| // COPY_S1: copy one strided single from X to Y and step both pointers. | |||||
| .macro COPY_S1 | |||||
| fldmias X, { s0 } | |||||
| fstmias Y, { s0 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Kernel entry. Copies N singles from X to Y through a unit-stride path | |||||
| // (COPY_F8 / COPY_F1) or a strided path (COPY_S4 / COPY_S1) and always | |||||
| // returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Seven registers are pushed (28 bytes); fp = sp + 24 then addresses the | |||||
| // saved fp slot, so [fp, #4] is the first word above the pushed registers. | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| // NOTE(review): the COPY_* macros in this file only touch s0 - s7, so | |||||
| // this save / restore of s8 - s15 looks unnecessary; confirm before removing. | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| // Nothing to do for N <= 0 or when either increment is zero. | |||||
| cmp N, #0 | |||||
| ble scopy_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq scopy_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq scopy_kernel_L999 | |||||
| // The unit-stride path requires both increments to be exactly 1. | |||||
| cmp INC_X, #1 | |||||
| bne scopy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne scopy_kernel_S_BEGIN | |||||
| scopy_kernel_F_BEGIN: | |||||
| asrs I, N, #3 // I = N / 8 | |||||
| ble scopy_kernel_F1 | |||||
| // Eight elements per pass. | |||||
| scopy_kernel_F8: | |||||
| COPY_F8 | |||||
| subs I, I, #1 | |||||
| bne scopy_kernel_F8 | |||||
| // Remainder of the unit-stride path: N & 7 single elements. | |||||
| scopy_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble scopy_kernel_L999 | |||||
| scopy_kernel_F10: | |||||
| COPY_F1 | |||||
| subs I, I, #1 | |||||
| bne scopy_kernel_F10 | |||||
| b scopy_kernel_L999 | |||||
| // Strided path: turn the element increments into byte increments. | |||||
| scopy_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble scopy_kernel_S1 | |||||
| // Four strided elements per pass. | |||||
| scopy_kernel_S4: | |||||
| COPY_S4 | |||||
| subs I, I, #1 | |||||
| bne scopy_kernel_S4 | |||||
| // Remainder of the strided path: N & 3 single elements. | |||||
| scopy_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble scopy_kernel_L999 | |||||
| scopy_kernel_S10: | |||||
| COPY_S1 | |||||
| subs I, I, #1 | |||||
| bne scopy_kernel_S10 | |||||
| // Common exit: reload s8 - s15, return 0, rewind sp to the pushed | |||||
| // registers and restore them. | |||||
| scopy_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,347 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/11 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK (no test for dsdot) | |||||
| * TEST : OK (no test for dsdot) | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of scratch stack reserved by the prologue. | |||||
| #define STACKSIZE 256 | |||||
| // Element count. | |||||
| #define N r0 | |||||
| // Pointer to the current element of the first vector. | |||||
| #define X r1 | |||||
| // Increment of X; converted to a byte stride on the strided path. | |||||
| #define INC_X r2 | |||||
| // Second vector pointer as received; copied into Y by the prologue. | |||||
| #define OLD_Y r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| * | |||||
| * NOTE(review): the kernel stores s8 - s15 (32 bytes) | |||||
| * starting at fp - 128, so only part of this range is | |||||
| * actually written. | |||||
| *******************************************************/ | |||||
| // Slot INC_Y is loaded from once fp has been set up by the prologue. | |||||
| #define OLD_INC_Y [fp, #4 ] | |||||
| // Loop counter. | |||||
| #define I r5 | |||||
| // Pointer to the current element of the second vector. | |||||
| #define Y r6 | |||||
| // Increment of Y; converted to a byte stride on the strided path. | |||||
| #define INC_Y r7 | |||||
| // Not referenced by the kernel code visible in this file. | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(DSDOT) | |||||
| // DSDOT variants: single precision inputs, double precision accumulation | |||||
| // in d0. Each operand is widened to double BEFORE the multiply, so the | |||||
| // product itself is formed in double precision. Multiplying in single | |||||
| // precision first and widening the rounded product afterwards would | |||||
| // throw away the extra precision DSDOT exists to provide. | |||||
| // Scratch registers: s14, s15 (loaded operands), d4, d5 (widened operands). | |||||
| // KERNEL_F1 / KERNEL_S1 are defined below; nested macro names are only | |||||
| // looked up when the outer macro is expanded. | |||||
| // KERNEL_F4: four unit-stride elements. | |||||
| .macro KERNEL_F4 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| .endm | |||||
| // KERNEL_F1: one unit-stride element; the loads advance X and Y. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s14 } | |||||
| fldmias Y!, { s15 } | |||||
| vcvt.f64.f32 d4, s14 | |||||
| vcvt.f64.f32 d5, s15 | |||||
| fmacd d0 , d4, d5 | |||||
| .endm | |||||
| // KERNEL_S4: four strided elements. | |||||
| .macro KERNEL_S4 | |||||
| nop | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| .endm | |||||
| // KERNEL_S1: one strided element; X and Y are stepped by INC_X / INC_Y | |||||
| // (used here as byte offsets). | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s14 } | |||||
| fldmias Y, { s15 } | |||||
| vcvt.f64.f32 d4, s14 | |||||
| vcvt.f64.f32 d5, s15 | |||||
| fmacd d0 , d4, d5 | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| // Single precision dot product, unit stride. Two partial sums are kept | |||||
| // in s0 and s1. | |||||
| // KERNEL_F4: four elements. X values go to s8 - s11, Y values to s4 - s7; | |||||
| // loads are interleaved with the multiply-accumulates. Elements 0 and 2 | |||||
| // are added to s0, elements 1 and 3 to s1. | |||||
| .macro KERNEL_F4 | |||||
| fldmias X!, { s8 - s9 } | |||||
| fldmias Y!, { s4 - s5} | |||||
| fmacs s0 , s4, s8 | |||||
| fldmias X!, { s10 - s11 } | |||||
| fmacs s1 , s5, s9 | |||||
| fldmias Y!, { s6 - s7 } | |||||
| fmacs s0 , s6, s10 | |||||
| fmacs s1 , s7, s11 | |||||
| .endm | |||||
| // KERNEL_F1: one element, accumulated into s0. | |||||
| .macro KERNEL_F1 | |||||
| fldmias X!, { s4 } | |||||
| fldmias Y!, { s8 } | |||||
| fmacs s0 , s4, s8 | |||||
| .endm | |||||
| // Single precision dot product, strided. X and Y are stepped by | |||||
| // INC_X / INC_Y (used here as byte offsets) after every element. | |||||
| // KERNEL_S4: four elements; products alternate between the partial sums | |||||
| // s0 and s1. | |||||
| .macro KERNEL_S4 | |||||
| nop | |||||
| fldmias X, { s4 } | |||||
| fldmias Y, { s8 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacs s0 , s4, s8 | |||||
| fldmias X, { s5 } | |||||
| fldmias Y, { s9 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacs s1 , s5, s9 | |||||
| fldmias X, { s6 } | |||||
| fldmias Y, { s10 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacs s0 , s6, s10 | |||||
| fldmias X, { s7 } | |||||
| fldmias Y, { s11 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fmacs s1 , s7, s11 | |||||
| .endm | |||||
| // KERNEL_S1: one element, accumulated into s0. | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s4 } | |||||
| fldmias Y, { s8 } | |||||
| add X, X, INC_X | |||||
| fmacs s0 , s4, s8 | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Kernel entry. Computes the dot product of X and Y through a unit-stride | |||||
| // path (KERNEL_F4 / KERNEL_F1) or a strided path (KERNEL_S4 / KERNEL_S1). | |||||
| // The result is returned in s0, or in d0 when DSDOT is defined. It is | |||||
| // zero when N <= 0 or when either increment is zero. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| // Seven registers are pushed (28 bytes); fp = sp + 24 then addresses the | |||||
| // saved fp slot, so [fp, #4] is the first word above the pushed registers. | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15 } // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| // Clear the accumulators by loading zero bits from a core register. | |||||
| // Subtracting a register from itself is NOT a safe way to get zero: the | |||||
| // registers hold whatever was left in them, and x - x is NaN when x is | |||||
| // NaN or infinite, which would poison the whole result. | |||||
| // r4 is free again here; it was only a temporary pointer for the vstm. | |||||
| mov r4, #0 | |||||
| #if defined(DSDOT) | |||||
| vmov s0 , r4 // d0 = s0:s1 = +0.0 | |||||
| vmov s1 , r4 | |||||
| vmov s2 , r4 // d1 = s2:s3 = +0.0 | |||||
| vmov s3 , r4 | |||||
| #else | |||||
| vmov s0 , r4 | |||||
| vmov s1 , r4 | |||||
| #endif | |||||
| // Nothing to accumulate for N <= 0 or when either increment is zero. | |||||
| cmp N, #0 | |||||
| ble sdot_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq sdot_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq sdot_kernel_L999 | |||||
| // The unit-stride path requires both increments to be exactly 1. | |||||
| cmp INC_X, #1 | |||||
| bne sdot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne sdot_kernel_S_BEGIN | |||||
| sdot_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble sdot_kernel_F1 | |||||
| // Four elements per pass. | |||||
| sdot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne sdot_kernel_F4 | |||||
| // Remainder of the unit-stride path: N & 3 single elements. | |||||
| sdot_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble sdot_kernel_L999 | |||||
| sdot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne sdot_kernel_F10 | |||||
| b sdot_kernel_L999 | |||||
| // Strided path: turn the element increments into byte increments. | |||||
| sdot_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble sdot_kernel_S1 | |||||
| // Four strided elements per pass. | |||||
| sdot_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne sdot_kernel_S4 | |||||
| // Remainder of the strided path: N & 3 single elements. | |||||
| sdot_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble sdot_kernel_L999 | |||||
| sdot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne sdot_kernel_S10 | |||||
| // Common exit: reload s8 - s15, fold the two partial sums into the | |||||
| // return register, rewind sp to the pushed registers and restore them. | |||||
| sdot_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| #if defined(DSDOT) | |||||
| vadd.f64 d0 , d0, d1 // set return value | |||||
| #else | |||||
| vadd.f32 s0 , s0, s1 // set return value | |||||
| #endif | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,797 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/28 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of local frame the prologue reserves below the pushed registers. | |||||
| #define STACKSIZE 256 | |||||
| // Incoming arguments as they arrive in registers. | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_K r2 | |||||
| #define OLD_A r3 | |||||
| #define OLD_ALPHA s0 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| // Spill slots inside the local frame. fp is set to sp + 24 after a | |||||
| // seven-register push and sp is then lowered by STACKSIZE, so the frame | |||||
| // spans [fp, #-280] .. [fp, #-25] and ALPHA occupies its lowest word. | |||||
| #define LDC [fp, #-252 ] | |||||
| #define M [fp, #-256 ] | |||||
| #define N [fp, #-260 ] | |||||
| #define K [fp, #-264 ] | |||||
| #define A [fp, #-268 ] | |||||
| #define ALPHA [fp, #-280] | |||||
| // Arguments passed on the caller stack; fp addresses the highest pushed | |||||
| // word, so the first stack argument is at [fp, #4]. | |||||
| #define B [fp, #4 ] | |||||
| #define C [fp, #8 ] | |||||
| #define OLD_LDC [fp, #12 ] | |||||
| // Working registers. I, J and L reuse r0-r2; the prologue stores | |||||
| // OLD_M / OLD_N / OLD_K to their frame slots before they are overwritten. | |||||
| #define I r0 | |||||
| #define J r1 | |||||
| #define L r2 | |||||
| #define AO r5 | |||||
| #define BO r6 | |||||
| #define CO1 r8 | |||||
| #define CO2 r9 | |||||
| #define K1 r7 | |||||
| #define BC r12 | |||||
| // Byte offsets used by the pld prefetch hints. C_PRE is not referenced by | |||||
| // the code visible in this file. | |||||
| #define A_PRE 96 | |||||
| #define B_PRE 96 | |||||
| #define C_PRE 64 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // Clear the eight accumulators s8-s15 of a 4x2 tile to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf, which | |||||
| // would then propagate into every element of the tile. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT4x2 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| vmov.f32 s9, s8 | |||||
| vmov.f32 s10, s8 | |||||
| vmov.f32 s11, s8 | |||||
| vmov.f32 s12, s8 | |||||
| vmov.f32 s13, s8 | |||||
| vmov.f32 s14, s8 | |||||
| vmov.f32 s15, s8 | |||||
| .endm | |||||
| // One k-step of the 4x2 tile. | |||||
| // Loads four floats from AO into s0-s3 and two floats from BO into s4-s5, | |||||
| // post-incrementing both pointers, then accumulates the outer product: | |||||
| // s8-s11 += s0..s3 * s4 and s12-s15 += s0..s3 * s5. | |||||
| .macro KERNEL4x2_SUB | |||||
| fldmias AO! , { s0 - s3 } | |||||
| fldmias BO! , { s4 - s5 } | |||||
| fmacs s8 , s0, s4 | |||||
| fmacs s9 , s1, s4 | |||||
| fmacs s10 , s2, s4 | |||||
| fmacs s11 , s3, s4 | |||||
| fmacs s12 , s0, s5 | |||||
| fmacs s13 , s1, s5 | |||||
| fmacs s14 , s2, s5 | |||||
| fmacs s15 , s3, s5 | |||||
| .endm | |||||
| // Write back a 4x2 tile: C += ALPHA * accumulator. | |||||
| // CO2 is set to CO1 + LDC (LDC is a byte stride). The four floats at CO1 | |||||
| // are updated with s8-s11 and the four floats at CO2 with s12-s15, then | |||||
| // CO1 advances by 16 bytes. Clobbers r3, s0 and s4-s7. | |||||
| .macro SAVE4x2 | |||||
| ldr r3 , LDC | |||||
| add CO2 , CO1, r3 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| flds s5 , [CO1, #4 ] | |||||
| flds s6 , [CO1, #8 ] | |||||
| flds s7 , [CO1, #12 ] | |||||
| fmacs s4 , s0 , s8 | |||||
| fmacs s5 , s0 , s9 | |||||
| fmacs s6 , s0 , s10 | |||||
| fmacs s7 , s0 , s11 | |||||
| fsts s4 , [CO1] | |||||
| fsts s5 , [CO1, #4 ] | |||||
| fsts s6 , [CO1, #8 ] | |||||
| fsts s7 , [CO1, #12 ] | |||||
| // second tile row set, one LDC stride further | |||||
| flds s4 , [CO2] | |||||
| flds s5 , [CO2, #4 ] | |||||
| flds s6 , [CO2, #8 ] | |||||
| flds s7 , [CO2, #12 ] | |||||
| fmacs s4 , s0 , s12 | |||||
| fmacs s5 , s0 , s13 | |||||
| fmacs s6 , s0 , s14 | |||||
| fmacs s7 , s0 , s15 | |||||
| fsts s4 , [CO2] | |||||
| fsts s5 , [CO2, #4 ] | |||||
| fsts s6 , [CO2, #8 ] | |||||
| fsts s7 , [CO2, #12 ] | |||||
| add CO1, CO1, #16 | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| // Clear the four accumulators of a 2x2 tile (s8, s9, s12, s13) to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT2x2 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| vmov.f32 s9, s8 | |||||
| vmov.f32 s12, s8 | |||||
| vmov.f32 s13, s8 | |||||
| .endm | |||||
| // One k-step of the 2x2 tile. | |||||
| // Loads two floats from BO (s4, s5) and two from AO (s0, s1), accumulates | |||||
| // s8 += s0*s4, s9 += s1*s4, s12 += s0*s5, s13 += s1*s5, then advances | |||||
| // AO and BO by 8 bytes each. | |||||
| .macro KERNEL2x2_SUB | |||||
| flds s4 , [ BO ] | |||||
| flds s5 , [ BO, #4 ] | |||||
| flds s0 , [ AO ] | |||||
| flds s1 , [ AO, #4 ] | |||||
| fmacs s8 , s0, s4 | |||||
| fmacs s9 , s1, s4 | |||||
| fmacs s12 , s0, s5 | |||||
| fmacs s13 , s1, s5 | |||||
| add AO , AO, #8 | |||||
| add BO , BO, #8 | |||||
| .endm | |||||
| // Write back a 2x2 tile: C += ALPHA * accumulator. | |||||
| // CO2 is set to CO1 + LDC (byte stride). The two floats at CO1 are updated | |||||
| // with s8/s9 and the two floats at CO2 with s12/s13, then CO1 advances by | |||||
| // 8 bytes. Clobbers r3, s0, s4 and s5. | |||||
| .macro SAVE2x2 | |||||
| ldr r3 , LDC | |||||
| add CO2 , CO1, r3 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| flds s5 , [CO1, #4 ] | |||||
| fmacs s4 , s0 , s8 | |||||
| fmacs s5 , s0 , s9 | |||||
| fsts s4 , [CO1] | |||||
| fsts s5 , [CO1, #4 ] | |||||
| flds s4 , [CO2] | |||||
| flds s5 , [CO2, #4 ] | |||||
| fmacs s4 , s0 , s12 | |||||
| fmacs s5 , s0 , s13 | |||||
| fsts s4 , [CO2] | |||||
| fsts s5 , [CO2, #4 ] | |||||
| add CO1, CO1, #8 | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| // Clear the two accumulators of a 1x2 tile (s8, s12) to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT1x2 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| vmov.f32 s12, s8 | |||||
| .endm | |||||
| // One k-step of the 1x2 tile. | |||||
| // Loads two floats from BO (s4, s5) and one from AO (s0), accumulates | |||||
| // s8 += s0*s4 and s12 += s0*s5, then advances AO by 4 and BO by 8 bytes. | |||||
| .macro KERNEL1x2_SUB | |||||
| flds s4 , [ BO ] | |||||
| flds s5 , [ BO, #4 ] | |||||
| flds s0 , [ AO ] | |||||
| fmacs s8 , s0, s4 | |||||
| fmacs s12 , s0, s5 | |||||
| add AO , AO, #4 | |||||
| add BO , BO, #8 | |||||
| .endm | |||||
| // Write back a 1x2 tile: C += ALPHA * accumulator. | |||||
| // CO2 is set to CO1 + LDC (byte stride). The float at CO1 is updated with | |||||
| // s8 and the float at CO2 with s12, then CO1 advances by 4 bytes. | |||||
| // Clobbers r3, s0 and s4. | |||||
| .macro SAVE1x2 | |||||
| ldr r3 , LDC | |||||
| add CO2 , CO1, r3 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| fmacs s4 , s0 , s8 | |||||
| fsts s4 , [CO1] | |||||
| flds s4 , [CO2] | |||||
| fmacs s4 , s0 , s12 | |||||
| fsts s4 , [CO2] | |||||
| add CO1, CO1, #4 | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| // Clear the four accumulators of a 4x1 tile (s8-s11) to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT4x1 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| vmov.f32 s9, s8 | |||||
| vmov.f32 s10, s8 | |||||
| vmov.f32 s11, s8 | |||||
| .endm | |||||
| // One k-step of the 4x1 tile. | |||||
| // Loads one float from BO (s4) and four from AO (s0-s3), accumulates | |||||
| // s8-s11 += s0..s3 * s4, then advances AO by 16 and BO by 4 bytes. | |||||
| .macro KERNEL4x1_SUB | |||||
| flds s4 , [ BO ] | |||||
| flds s0 , [ AO ] | |||||
| flds s1 , [ AO, #4 ] | |||||
| flds s2 , [ AO, #8 ] | |||||
| flds s3 , [ AO, #12 ] | |||||
| fmacs s8 , s0, s4 | |||||
| fmacs s9 , s1, s4 | |||||
| fmacs s10 , s2, s4 | |||||
| fmacs s11 , s3, s4 | |||||
| add AO , AO, #16 | |||||
| add BO , BO, #4 | |||||
| .endm | |||||
| // Write back a 4x1 tile: the four floats at CO1 are updated with | |||||
| // ALPHA * s8..s11 added in, then CO1 advances by 16 bytes. | |||||
| // Clobbers s0 and s4-s7; no integer scratch register is used. | |||||
| .macro SAVE4x1 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| flds s5 , [CO1, #4 ] | |||||
| flds s6 , [CO1, #8 ] | |||||
| flds s7 , [CO1, #12 ] | |||||
| fmacs s4 , s0 , s8 | |||||
| fmacs s5 , s0 , s9 | |||||
| fmacs s6 , s0 , s10 | |||||
| fmacs s7 , s0 , s11 | |||||
| fsts s4 , [CO1] | |||||
| fsts s5 , [CO1, #4 ] | |||||
| fsts s6 , [CO1, #8 ] | |||||
| fsts s7 , [CO1, #12 ] | |||||
| add CO1, CO1, #16 | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| // Clear the two accumulators of a 2x1 tile (s8, s9) to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT2x1 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| vmov.f32 s9 , s8 | |||||
| .endm | |||||
| // One k-step of the 2x1 tile. | |||||
| // Loads one float from BO (s4) and two from AO (s0, s1), accumulates | |||||
| // s8 += s0*s4 and s9 += s1*s4, then advances AO by 8 and BO by 4 bytes. | |||||
| .macro KERNEL2x1_SUB | |||||
| flds s4 , [ BO ] | |||||
| flds s0 , [ AO ] | |||||
| flds s1 , [ AO, #4 ] | |||||
| fmacs s8 , s0, s4 | |||||
| fmacs s9 , s1, s4 | |||||
| add AO , AO, #8 | |||||
| add BO , BO, #4 | |||||
| .endm | |||||
| // Write back a 2x1 tile: the two floats at CO1 are updated with | |||||
| // ALPHA * s8 and ALPHA * s9 added in, then CO1 advances by 8 bytes. | |||||
| // Clobbers s0, s4 and s5. | |||||
| .macro SAVE2x1 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| flds s5 , [CO1, #4 ] | |||||
| fmacs s4 , s0 , s8 | |||||
| fmacs s5 , s0 , s9 | |||||
| fsts s4 , [CO1] | |||||
| fsts s5 , [CO1, #4 ] | |||||
| add CO1, CO1, #8 | |||||
| .endm | |||||
| /******************************************************************************/ | |||||
| // Clear the single accumulator of a 1x1 tile (s8) to +0.0. | |||||
| // The zero is taken from an integer register instead of "s8 - s8": that | |||||
| // subtraction yields NaN whenever the stale s8 holds NaN or +/-Inf. | |||||
| // Clobbers r3 (scratch at every call site); condition flags are untouched. | |||||
| .macro INIT1x1 | |||||
| mov r3 , #0 | |||||
| vmov s8 , r3 // all-zero bits are +0.0 | |||||
| .endm | |||||
| // One k-step of the 1x1 tile: s8 += [AO] * [BO], then AO and BO each | |||||
| // advance by 4 bytes. Uses s0 and s4 as temporaries. | |||||
| .macro KERNEL1x1_SUB | |||||
| flds s4 , [ BO ] | |||||
| flds s0 , [ AO ] | |||||
| fmacs s8 , s0, s4 | |||||
| add AO , AO, #4 | |||||
| add BO , BO, #4 | |||||
| .endm | |||||
| // Write back a 1x1 tile: the float at CO1 gets ALPHA * s8 added in, then | |||||
| // CO1 advances by 4 bytes. Clobbers s0 and s4. | |||||
| .macro SAVE1x1 | |||||
| flds s0, ALPHA | |||||
| flds s4 , [CO1] | |||||
| fmacs s4 , s0 , s8 | |||||
| fsts s4 , [CO1] | |||||
| add CO1, CO1, #4 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Single-precision GEMM micro-kernel entry point (symbol emitted by PROLOGUE). | |||||
| // Register arguments: r0 = M, r1 = N, r2 = K, s0 = alpha, r3 = A. | |||||
| // Stack arguments: B, C and LDC (LDC in elements; scaled to bytes below). | |||||
| // C is updated in place as C += alpha * (A-panel * B-panel). | |||||
| // Loop structure: N is walked two columns at a time (J = N / 2) followed by | |||||
| // one trailing column when N is odd. For each column group M is walked in | |||||
| // tiles of 4 rows, then one tile of 2 and one of 1 for the remainder. The K | |||||
| // loop of every tile is unrolled by 8 with a scalar remainder loop. | |||||
| // AO runs on through A across the tiles of a column group, while BO is reset | |||||
| // to BC for every tile. Returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_M, M | |||||
| str OLD_N, N | |||||
| str OLD_K, K | |||||
| str OLD_A, A | |||||
| vstr OLD_ALPHA, ALPHA | |||||
| sub r3, fp, #128 | |||||
| vstm r3, { s8 - s15} // store floating point registers | |||||
| ldr r3, OLD_LDC | |||||
| lsl r3, r3, #2 // ldc = ldc * 4 | |||||
| str r3, LDC | |||||
| ldr K1, K | |||||
| ldr BC, B | |||||
| ldr J, N | |||||
| asr J, J, #1 // J = J / 2 | |||||
| // Nothing above sets the condition flags, and a flag-setting shift leaves V | |||||
| // unchanged, so "ble" would test the V flag inherited from the caller. | |||||
| // cmp defines N, Z, C and V; V then stays clear for the tst/ands/asrs | |||||
| // based branches further down. | |||||
| cmp J, #0 | |||||
| ble sgemm_kernel_L1_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| sgemm_kernel_L2_BEGIN: | |||||
| ldr CO1, C // CO1 = C | |||||
| ldr r4 , LDC | |||||
| lsl r4 , r4 , #1 // LDC * 2 | |||||
| add r3 , r4, CO1 | |||||
| str r3 , C // store C | |||||
| ldr AO, A // AO = A | |||||
| sgemm_kernel_L2_M4_BEGIN: | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = I / 4 | |||||
| ble sgemm_kernel_L2_M2_BEGIN | |||||
| sgemm_kernel_L2_M4_20: | |||||
| INIT4x2 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L2_M4_40 | |||||
| .align 5 | |||||
| sgemm_kernel_L2_M4_22: | |||||
| pld [ AO, #A_PRE ] | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ AO, #A_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ AO, #A_PRE ] | |||||
| pld [ BO, #B_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| pld [ AO, #A_PRE ] | |||||
| KERNEL4x2_SUB | |||||
| KERNEL4x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M4_22 | |||||
| sgemm_kernel_L2_M4_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L2_M4_100 | |||||
| sgemm_kernel_L2_M4_42: | |||||
| KERNEL4x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M4_42 | |||||
| sgemm_kernel_L2_M4_100: | |||||
| SAVE4x2 | |||||
| sgemm_kernel_L2_M4_END: | |||||
| subs I, I, #1 | |||||
| bgt sgemm_kernel_L2_M4_20 | |||||
| sgemm_kernel_L2_M2_BEGIN: | |||||
| ldr I, M | |||||
| tst I , #3 // any rows left after the 4-row tiles? | |||||
| ble sgemm_kernel_L2_END | |||||
| tst I, #2 // is a 2-row tile left? | |||||
| ble sgemm_kernel_L2_M1_BEGIN | |||||
| sgemm_kernel_L2_M2_20: | |||||
| INIT2x2 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L2_M2_40 | |||||
| sgemm_kernel_L2_M2_22: | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| KERNEL2x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M2_22 | |||||
| sgemm_kernel_L2_M2_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L2_M2_100 | |||||
| sgemm_kernel_L2_M2_42: | |||||
| KERNEL2x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M2_42 | |||||
| sgemm_kernel_L2_M2_100: | |||||
| SAVE2x2 | |||||
| sgemm_kernel_L2_M2_END: | |||||
| sgemm_kernel_L2_M1_BEGIN: | |||||
| tst I, #1 // I = I % 2 | |||||
| ble sgemm_kernel_L2_END | |||||
| sgemm_kernel_L2_M1_20: | |||||
| INIT1x2 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L2_M1_40 | |||||
| sgemm_kernel_L2_M1_22: | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| KERNEL1x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M1_22 | |||||
| sgemm_kernel_L2_M1_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L2_M1_100 | |||||
| sgemm_kernel_L2_M1_42: | |||||
| KERNEL1x2_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L2_M1_42 | |||||
| sgemm_kernel_L2_M1_100: | |||||
| SAVE1x2 | |||||
| sgemm_kernel_L2_END: | |||||
| mov r3, BC | |||||
| mov r4, K1 | |||||
| lsl r4, r4, #3 // k * 2 * 4 | |||||
| add r3, r3, r4 // B = B + K * 2 * 4 | |||||
| mov BC, r3 | |||||
| subs J , #1 // j-- | |||||
| bgt sgemm_kernel_L2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| sgemm_kernel_L1_BEGIN: | |||||
| ldr J , N | |||||
| tst J , #1 // trailing single column only when N is odd | |||||
| ble sgemm_kernel_L999 | |||||
| ldr CO1, C // CO1 = C | |||||
| ldr r4 , LDC | |||||
| add r3 , r4, CO1 | |||||
| str r3 , C // store C | |||||
| ldr AO, A // AO = A | |||||
| sgemm_kernel_L1_M4_BEGIN: | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = I / 4 | |||||
| ble sgemm_kernel_L1_M2_BEGIN | |||||
| sgemm_kernel_L1_M4_20: | |||||
| INIT4x1 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L1_M4_40 | |||||
| .align 5 | |||||
| sgemm_kernel_L1_M4_22: | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M4_22 | |||||
| sgemm_kernel_L1_M4_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L1_M4_100 | |||||
| sgemm_kernel_L1_M4_42: | |||||
| KERNEL4x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M4_42 | |||||
| sgemm_kernel_L1_M4_100: | |||||
| SAVE4x1 | |||||
| sgemm_kernel_L1_M4_END: | |||||
| subs I, I, #1 | |||||
| bgt sgemm_kernel_L1_M4_20 | |||||
| sgemm_kernel_L1_M2_BEGIN: | |||||
| ldr I, M | |||||
| tst I , #3 // any rows left after the 4-row tiles? | |||||
| ble sgemm_kernel_L1_END | |||||
| tst I, #2 // is a 2-row tile left? | |||||
| ble sgemm_kernel_L1_M1_BEGIN | |||||
| sgemm_kernel_L1_M2_20: | |||||
| INIT2x1 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L1_M2_40 | |||||
| sgemm_kernel_L1_M2_22: | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| KERNEL2x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M2_22 | |||||
| sgemm_kernel_L1_M2_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L1_M2_100 | |||||
| sgemm_kernel_L1_M2_42: | |||||
| KERNEL2x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M2_42 | |||||
| sgemm_kernel_L1_M2_100: | |||||
| SAVE2x1 | |||||
| sgemm_kernel_L1_M2_END: | |||||
| sgemm_kernel_L1_M1_BEGIN: | |||||
| tst I, #1 // I = I % 2 | |||||
| ble sgemm_kernel_L1_END | |||||
| sgemm_kernel_L1_M1_20: | |||||
| INIT1x1 | |||||
| mov BO, BC | |||||
| asrs L , K1, #3 // L = L / 8 | |||||
| ble sgemm_kernel_L1_M1_40 | |||||
| sgemm_kernel_L1_M1_22: | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| KERNEL1x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M1_22 | |||||
| sgemm_kernel_L1_M1_40: | |||||
| ands L , K1, #7 // L = L % 8 | |||||
| ble sgemm_kernel_L1_M1_100 | |||||
| sgemm_kernel_L1_M1_42: | |||||
| KERNEL1x1_SUB | |||||
| subs L, L, #1 | |||||
| bgt sgemm_kernel_L1_M1_42 | |||||
| sgemm_kernel_L1_M1_100: | |||||
| SAVE1x1 | |||||
| sgemm_kernel_L1_END: | |||||
| sgemm_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,225 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/24 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // NOTE(review): STACKSIZE is not referenced by the code visible in this | |||||
| // file; the prologue here reserves no local frame. | |||||
| #define STACKSIZE 256 | |||||
| // Incoming arguments as they arrive in registers. | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| // First stack-passed argument; fp addresses the highest word of the | |||||
| // seven-register push, so it sits at [fp, #4]. | |||||
| #define B [fp, #4 ] | |||||
| // M, N and A stay in their argument registers for the whole routine. | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define A r2 | |||||
| // BO: write cursor into B. AO1/AO2: read cursors LDA bytes apart. | |||||
| #define BO r5 | |||||
| #define AO1 r6 | |||||
| #define AO2 r7 | |||||
| // LDA is held as a byte stride (the prologue shifts OLD_LDA left by 2). | |||||
| #define LDA r8 | |||||
| // I reuses r3 once OLD_LDA has been consumed; J is the outer counter. | |||||
| #define I r3 | |||||
| #define J r12 | |||||
| // NOTE(review): A_PRE is not referenced by the code visible in this file. | |||||
| #define A_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // Copy a 2x2 group: reads two floats from AO1 and two from AO2 and stores | |||||
| // them to BO interleaved as AO1[0], AO2[0], AO1[1], AO2[1]. | |||||
| // AO1 and AO2 advance by 8 bytes, BO by 16 bytes. Clobbers s0-s3. | |||||
| .macro COPY2x2 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s2 , [ AO1, #4 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| flds s3 , [ AO2, #4 ] | |||||
| add AO1, AO1, #8 | |||||
| fstmias BO!, { s0 - s3 } | |||||
| add AO2, AO2, #8 | |||||
| .endm | |||||
| // Copy a 1x2 group: stores AO1[0], AO2[0] to BO. | |||||
| // AO1 and AO2 advance by 4 bytes, BO by 8 bytes. Clobbers s0 and s1. | |||||
| .macro COPY1x2 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #4 | |||||
| fstmias BO!, { s0 - s1 } | |||||
| add AO2, AO2, #4 | |||||
| .endm | |||||
| // Copy a 2x1 group: stores AO1[0], AO1[1] to BO. | |||||
| // AO1 and BO both advance by 8 bytes. Clobbers s0 and s1. | |||||
| .macro COPY2x1 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO1, #4 ] | |||||
| fstmias BO!, { s0 - s1 } | |||||
| add AO1, AO1, #8 | |||||
| .endm | |||||
| // Copy a single float from AO1 to BO; both advance by 4 bytes. Clobbers s0. | |||||
| .macro COPY1x1 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| fstmias BO!, { s0 } | |||||
| add AO1, AO1, #4 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Packing routine, two vectors wide (symbol emitted by PROLOGUE). | |||||
| // Register arguments: r0 = M, r1 = N, r2 = A, r3 = LDA (elements); the | |||||
| // destination B is the first stack argument. | |||||
| // For each pair of source vectors (N / 2 pairs, LDA bytes apart) the M | |||||
| // elements are written to B interleaved, two at a time with a single | |||||
| // element remainder. When N is odd the last vector is copied to B as is. | |||||
| // Returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| lsl LDA, OLD_LDA, #2 // lda = lda * 4 | |||||
| ldr BO, B | |||||
| /*********************************************************************************************/ | |||||
| sgemm_ncopy_L2_BEGIN: | |||||
| asr J, N, #1 // J = N / 2 | |||||
| // Nothing above sets the condition flags, and a flag-setting shift leaves V | |||||
| // unchanged, so "ble" would test the V flag inherited from the caller. | |||||
| // cmp defines N, Z, C and V; V then stays clear for the tst/ands/asrs | |||||
| // based branches further down. | |||||
| cmp J, #0 | |||||
| ble sgemm_ncopy_L1_BEGIN | |||||
| sgemm_ncopy_L2_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| add AO2, AO1, LDA | |||||
| add A , AO2, LDA // A = A + 2 * LDA | |||||
| asrs I, M, #1 // I = M / 2 | |||||
| ble sgemm_ncopy_L2_M2_40 | |||||
| sgemm_ncopy_L2_M2_20: | |||||
| COPY2x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L2_M2_20 | |||||
| sgemm_ncopy_L2_M2_40: | |||||
| ands I, M , #1 // I = M % 2 | |||||
| ble sgemm_ncopy_L2_M2_END | |||||
| sgemm_ncopy_L2_M2_60: | |||||
| COPY1x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L2_M2_60 | |||||
| sgemm_ncopy_L2_M2_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne sgemm_ncopy_L2_M2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| sgemm_ncopy_L1_BEGIN: | |||||
| tst N, #1 // trailing single vector only when N is odd | |||||
| ble sgemm_ncopy_L999 | |||||
| sgemm_ncopy_L1_M2_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| add A , AO1, LDA // A = A + 1 * LDA | |||||
| asrs I, M, #1 // I = M / 2 | |||||
| ble sgemm_ncopy_L1_M2_40 | |||||
| sgemm_ncopy_L1_M2_20: | |||||
| COPY2x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L1_M2_20 | |||||
| sgemm_ncopy_L1_M2_40: | |||||
| ands I, M , #1 // I = M % 2 | |||||
| ble sgemm_ncopy_L1_M2_END | |||||
| sgemm_ncopy_L1_M2_60: | |||||
| COPY1x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L1_M2_60 | |||||
| sgemm_ncopy_L1_M2_END: | |||||
| sgemm_ncopy_L999: | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,353 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/05 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| // Bytes of local frame the prologue reserves below the pushed registers. | |||||
| #define STACKSIZE 256 | |||||
| // Incoming arguments as they arrive in registers. | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| // LDA is spilled to the local frame as a byte stride; B is the first | |||||
| // stack-passed argument. | |||||
| #define LDA [fp, #-260 ] | |||||
| #define B [fp, #4 ] | |||||
| // M, N and A stay in their argument registers for the whole routine. | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define A r2 | |||||
| // BO: write cursor into B. AO1-AO4: read cursors, each LDA bytes apart. | |||||
| #define BO r5 | |||||
| #define AO1 r6 | |||||
| #define AO2 r7 | |||||
| #define AO3 r8 | |||||
| #define AO4 r9 | |||||
| // I reuses r3 once OLD_LDA has been spilled; J is the outer counter. | |||||
| #define I r3 | |||||
| #define J r12 | |||||
| // Byte offset used by the pld prefetch hints in the 4x4 copy loop. | |||||
| #define A_PRE 192 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| // Copy a 4x4 group: reads four floats from each of AO1-AO4 and stores the | |||||
| // sixteen values to BO interleaved, i.e. element 0 of AO1..AO4 (s0-s3), | |||||
| // then element 1 (s4-s7), element 2 (s8-s11) and element 3 (s12-s15). | |||||
| // Each AOn advances by 16 bytes, BO by 64 bytes. Clobbers s0-s15. | |||||
| // The pointer increments are interleaved with the loads. | |||||
| .macro COPY4x4 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| flds s2 , [ AO3, #0 ] | |||||
| flds s3 , [ AO4, #0 ] | |||||
| flds s4 , [ AO1, #4 ] | |||||
| flds s8 , [ AO1, #8 ] | |||||
| flds s12, [ AO1, #12 ] | |||||
| flds s5 , [ AO2, #4 ] | |||||
| add AO1, AO1, #16 | |||||
| flds s9 , [ AO2, #8 ] | |||||
| flds s13, [ AO2, #12 ] | |||||
| flds s6 , [ AO3, #4 ] | |||||
| add AO2, AO2, #16 | |||||
| flds s10, [ AO3, #8 ] | |||||
| flds s14, [ AO3, #12 ] | |||||
| flds s7 , [ AO4, #4 ] | |||||
| add AO3, AO3, #16 | |||||
| flds s11, [ AO4, #8 ] | |||||
| flds s15, [ AO4, #12 ] | |||||
| fstmias BO!, { s0 - s3 } | |||||
| add AO4, AO4, #16 | |||||
| fstmias BO!, { s4 - s7 } | |||||
| fstmias BO!, { s8 - s15 } | |||||
| .endm | |||||
| // Copy a 1x4 group: stores AO1[0], AO2[0], AO3[0], AO4[0] to BO. | |||||
| // Each AOn advances by 4 bytes, BO by 16 bytes. Clobbers s0-s3. | |||||
| .macro COPY1x4 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #4 | |||||
| flds s2 , [ AO3, #0 ] | |||||
| add AO2, AO2, #4 | |||||
| flds s3 , [ AO4, #0 ] | |||||
| add AO3, AO3, #4 | |||||
| fstmias BO!, { s0 - s3 } | |||||
| add AO4, AO4, #4 | |||||
| .endm | |||||
| // Copy a 4x2 group: reads four floats from AO1 (even s-registers) and four | |||||
| // from AO2 (odd s-registers) and stores them to BO interleaved as | |||||
| // AO1[0], AO2[0], AO1[1], AO2[1], AO1[2], AO2[2], AO1[3], AO2[3]. | |||||
| // AO1 and AO2 advance by 16 bytes, BO by 32 bytes. Clobbers s0-s7. | |||||
| .macro COPY4x2 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s2 , [ AO1, #4 ] | |||||
| flds s4 , [ AO1, #8 ] | |||||
| flds s6 , [ AO1, #12 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| flds s3 , [ AO2, #4 ] | |||||
| add AO1, AO1, #16 | |||||
| flds s5 , [ AO2, #8 ] | |||||
| flds s7 , [ AO2, #12 ] | |||||
| fstmias BO!, { s0 - s7 } | |||||
| add AO2, AO2, #16 | |||||
| .endm | |||||
| // Copy a 1x2 group: stores AO1[0], AO2[0] to BO. | |||||
| // AO1 and AO2 advance by 4 bytes, BO by 8 bytes. Clobbers s0 and s1. | |||||
| .macro COPY1x2 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO2, #0 ] | |||||
| add AO1, AO1, #4 | |||||
| fstmias BO!, { s0 - s1 } | |||||
| add AO2, AO2, #4 | |||||
| .endm | |||||
| // Copy a 4x1 group: stores AO1[0..3] to BO unchanged. | |||||
| // AO1 and BO both advance by 16 bytes. Clobbers s0-s3. | |||||
| .macro COPY4x1 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| flds s1 , [ AO1, #4 ] | |||||
| flds s2 , [ AO1, #8 ] | |||||
| flds s3 , [ AO1, #12 ] | |||||
| fstmias BO!, { s0 - s3 } | |||||
| add AO1, AO1, #16 | |||||
| .endm | |||||
| // Copy a single float from AO1 to BO; both advance by 4 bytes. Clobbers s0. | |||||
| .macro COPY1x1 | |||||
| flds s0 , [ AO1, #0 ] | |||||
| fstmias BO!, { s0 } | |||||
| add AO1, AO1, #4 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| // Packing routine, four vectors wide (symbol emitted by PROLOGUE). | |||||
| // Register arguments: r0 = M, r1 = N, r2 = A, r3 = LDA (elements); the | |||||
| // destination B is the first stack argument. | |||||
| // Source vectors (LDA bytes apart) are consumed in groups of four (N / 4 | |||||
| // groups), then one group of two if N & 2 and one single vector if N & 1. | |||||
| // Within a group the M elements are written to B interleaved, four at a | |||||
| // time with a single-element remainder loop. s8-s15 are saved and restored | |||||
| // around the routine because COPY4x4 overwrites them. Returns 0 in r0. | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| lsl r3, r3, #2 // lda = lda * 4 | |||||
| str r3, LDA | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| ldr BO, B | |||||
| sgemm_ncopy_L4_BEGIN: | |||||
| asr J, N, #2 // J = N / 4 | |||||
| // Nothing above sets the condition flags, and a flag-setting shift leaves V | |||||
| // unchanged, so "ble" would test the V flag inherited from the caller. | |||||
| // cmp defines N, Z, C and V; V then stays clear for the tst/ands/asrs | |||||
| // based branches further down. | |||||
| cmp J, #0 | |||||
| ble sgemm_ncopy_L2_BEGIN | |||||
| sgemm_ncopy_L4_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add AO2, AO1, r4 | |||||
| add AO3, AO2, r4 | |||||
| add AO4, AO3, r4 | |||||
| add A , AO4, r4 // A = A + 4 * LDA | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble sgemm_ncopy_L4_M4_40 | |||||
| // Two COPY4x4 per prefetch round; the middle exit handles an odd count. | |||||
| sgemm_ncopy_L4_M4_20: | |||||
| pld [ AO1, #A_PRE ] | |||||
| pld [ AO2, #A_PRE ] | |||||
| pld [ AO3, #A_PRE ] | |||||
| pld [ AO4, #A_PRE ] | |||||
| COPY4x4 | |||||
| subs I , I , #1 | |||||
| ble sgemm_ncopy_L4_M4_40 | |||||
| COPY4x4 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L4_M4_20 | |||||
| sgemm_ncopy_L4_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| ble sgemm_ncopy_L4_M4_END | |||||
| sgemm_ncopy_L4_M4_60: | |||||
| COPY1x4 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L4_M4_60 | |||||
| sgemm_ncopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne sgemm_ncopy_L4_M4_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| sgemm_ncopy_L2_BEGIN: | |||||
| tst N, #3 // any vectors left after the groups of four? | |||||
| ble sgemm_ncopy_L999 | |||||
| tst N, #2 // is a group of two left? | |||||
| ble sgemm_ncopy_L1_BEGIN | |||||
| sgemm_ncopy_L2_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add AO2, AO1, r4 | |||||
| add A , AO2, r4 // A = A + 2 * LDA | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble sgemm_ncopy_L2_M4_40 | |||||
| sgemm_ncopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L2_M4_20 | |||||
| sgemm_ncopy_L2_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| ble sgemm_ncopy_L2_M4_END | |||||
| sgemm_ncopy_L2_M4_60: | |||||
| COPY1x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L2_M4_60 | |||||
| sgemm_ncopy_L2_M4_END: | |||||
| /*********************************************************************************************/ | |||||
| sgemm_ncopy_L1_BEGIN: | |||||
| tst N, #1 // trailing single vector only when N is odd | |||||
| ble sgemm_ncopy_L999 | |||||
| sgemm_ncopy_L1_M4_BEGIN: | |||||
| mov AO1, A // AO1 = A | |||||
| ldr r4 , LDA | |||||
| add A , AO1, r4 // A = A + 1 * LDA | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble sgemm_ncopy_L1_M4_40 | |||||
| sgemm_ncopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L1_M4_20 | |||||
| sgemm_ncopy_L1_M4_40: | |||||
| ands I, M , #3 // I = M % 4 | |||||
| ble sgemm_ncopy_L1_M4_END | |||||
| sgemm_ncopy_L1_M4_60: | |||||
| COPY1x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_ncopy_L1_M4_60 | |||||
| sgemm_ncopy_L1_M4_END: | |||||
| sgemm_ncopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| movs r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,430 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/06 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| * | |||||
| * The routine below stores s8 - s15 starting at fp - 128 | |||||
| * and reloads them from the same address before returning. | |||||
| * NOTE(review): confirm against the target ABI whether | |||||
| * s8 - s15 actually have to be preserved. | |||||
| * | |||||
| * Incoming values: M, N, A and LDA are taken from r0 - r3 | |||||
| * (the OLD_* names). B is read from [fp, #4]. | |||||
| * | |||||
| * Stack slots: A ([fp, #-248]) holds the running source | |||||
| * pointer. The running destination pointer is written back | |||||
| * into the B slot ([fp, #4]) itself. STACKSIZE bytes are | |||||
| * subtracted from sp by the prologue. | |||||
| * | |||||
| * Register roles after the prologue: | |||||
| * LDA row stride in bytes (OLD_LDA shifted left by 2) | |||||
| * M4 M shifted left by 4; shares r2 with OLD_A, so it | |||||
| * is only set after A has been stored to its slot | |||||
| * AO1 read cursor | |||||
| * BO1 write cursor used by the 4-wide COPY macros | |||||
| * BO2 write cursor used by the 2-wide COPY macros | |||||
| * BO3 write cursor used by the 1-wide COPY macros | |||||
| * I, J inner and outer loop counters | |||||
| * r3 scratch once OLD_LDA has been consumed | |||||
| * A_PRE is the byte offset used by the pld prefetches. | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] | |||||
| #define A [fp, #-248 ] | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define M4 r2 | |||||
| #define LDA r5 | |||||
| #define AO1 r6 | |||||
| #define BO1 r7 | |||||
| #define BO2 r8 | |||||
| #define BO3 r9 | |||||
| #define I r4 | |||||
| #define J r12 | |||||
| #define A_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| * | |||||
| * COPY<w>x<h> loads <w> consecutive single precision values from each of <h> rows. | |||||
| * The first row starts at AO1; every further row is reached by adding LDA (bytes) | |||||
| * to the previous row address, using r3 as scratch. The <w> * <h> values are then | |||||
| * stored as one contiguous run, row after row, and AO1 is advanced by <w> * 4 bytes. | |||||
| * | |||||
| * Destination cursor by width: | |||||
| * <w> = 4 : stored at BO1, BO1 is then advanced by M4 | |||||
| * <w> = 2 : stored at BO2, BO2 is then advanced by the number of bytes stored | |||||
| * <w> = 1 : stored at BO3, BO3 is then advanced by the number of bytes stored | |||||
| * | |||||
| * COPY4x4_1 and COPY4x4_2 move the same data; COPY4x4_1 additionally issues a pld | |||||
| * at offset A_PRE for each of the four row addresses. | |||||
| * | |||||
| * Registers overwritten: r3 in every macro that reads more than one row, plus | |||||
| * s0 - s15 (4x4), s0 - s7 (2x4, 4x2), s0 - s3 (1x4, 2x2, 4x1), s0 - s1 (1x2, 2x1) | |||||
| * and s0 (1x1). | |||||
| **************************************************************************************/ | |||||
| .macro COPY4x4_1 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s4 - s7 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s8 - s11 } | |||||
| add r3, r3, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmias r3, { s12 - s15 } | |||||
| fstmias BO1, { s0 - s15 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY4x4_2 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s4 - s7 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s8 - s11 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s12 - s15 } | |||||
| fstmias BO1, { s0 - s15 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x4 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s2 - s3 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s4 - s5 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s6 - s7 } | |||||
| fstmias BO2, { s0 - s7 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #32 | |||||
| .endm | |||||
| .macro COPY1x4 | |||||
| fldmias AO1, { s0 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s1 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s2 } | |||||
| add r3, r3, LDA | |||||
| fldmias r3, { s3 } | |||||
| fstmias BO3, { s0 - s3 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #16 | |||||
| .endm | |||||
| /************************************************************************************************************************* | |||||
| * Variants that read a pair of rows (AO1 and AO1 + LDA) | |||||
| *************************************************************************************************************************/ | |||||
| .macro COPY4x2 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s4 - s7 } | |||||
| fstmias BO1, { s0 - s7 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x2 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s2 - s3 } | |||||
| fstmias BO2, { s0 - s3 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #16 | |||||
| .endm | |||||
| .macro COPY1x2 | |||||
| fldmias AO1, { s0 } | |||||
| add r3, AO1, LDA | |||||
| fldmias r3, { s1 } | |||||
| fstmias BO3, { s0 - s1 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #8 | |||||
| .endm | |||||
| /************************************************************************************************************************* | |||||
| * Variants that read a single row (AO1 only, r3 is not touched) | |||||
| *************************************************************************************************************************/ | |||||
| .macro COPY4x1 | |||||
| fldmias AO1, { s0 - s3 } | |||||
| fstmias BO1, { s0 - s3 } | |||||
| add AO1, AO1, #16 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY2x1 | |||||
| fldmias AO1, { s0 - s1 } | |||||
| fstmias BO2, { s0 - s1 } | |||||
| add AO1, AO1, #8 | |||||
| add BO2, BO2, #8 | |||||
| .endm | |||||
| .macro COPY1x1 | |||||
| fldmias AO1, { s0 } | |||||
| fstmias BO3, { s0 } | |||||
| add AO1, AO1, #4 | |||||
| add BO3, BO3, #4 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| * | |||||
| * Routine body. | |||||
| * | |||||
| * Setup: A is stored to its stack slot, LDA is converted to bytes, s8 - s15 are | |||||
| * saved, and the three write cursors are derived from B: | |||||
| * BO2 = B + (N & -4) * M * 4 bytes | |||||
| * BO3 = B + (N & -2) * M * 4 bytes | |||||
| * M4 is set to M * 16 bytes, the step applied to BO1 by the 4-wide COPY macros. | |||||
| * | |||||
| * Outer structure (rows are LDA bytes apart): | |||||
| * L4 : J = M / 4 passes, each covering four rows; B advances by 64 bytes per pass | |||||
| * L2 : one pass covering two rows when M & 2; B advances by 32 bytes | |||||
| * L1 : one pass covering one row when M & 1; B advances by 16 bytes | |||||
| * | |||||
| * Inside every pass: I = N / 4 four-wide copies through BO1, then one two-wide copy | |||||
| * through BO2 when N & 2, then one one-wide copy through BO3 when N & 1. | |||||
| * | |||||
| * Exit: s8 - s15 are reloaded, r0 is set to 0, sp and r4 - r9, fp are restored. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #2 // lda = lda * SIZE | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { s8 - s15} // store floating point registers | |||||
| lsl r4 , M, #2 // M * SIZE | |||||
| ldr r3, B | |||||
| and BO2 , N , #-4 | |||||
| and BO3 , N , #-2 | |||||
| mul BO2, BO2, r4 | |||||
| mul BO3, BO3, r4 | |||||
| add BO2 , BO2, r3 | |||||
| add BO3 , BO3, r3 | |||||
| lsl M4, M, #4 // M4 = M * 4 * SIZE | |||||
| sgemm_tcopy_L4_BEGIN: | |||||
| asrs J, M, #2 // J = M / 4 | |||||
| ble sgemm_tcopy_L2_BEGIN | |||||
| sgemm_tcopy_L4_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #2 // r3 = 4 * LDA | |||||
| add r3, r3 , AO1 // A = A + 4 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #64 // B = B + 16 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble sgemm_tcopy_L4_M4_40 | |||||
| sgemm_tcopy_L4_M4_20: | |||||
| COPY4x4_1 | |||||
| subs I , I , #1 | |||||
| ble sgemm_tcopy_L4_M4_40 | |||||
| COPY4x4_2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L4_M4_20 | |||||
| sgemm_tcopy_L4_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L4_M4_60 | |||||
| COPY2x4 | |||||
| sgemm_tcopy_L4_M4_60: | |||||
| tst N, #1 | |||||
| ble sgemm_tcopy_L4_M4_END | |||||
| COPY1x4 | |||||
| sgemm_tcopy_L4_M4_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne sgemm_tcopy_L4_M4_BEGIN | |||||
| /********************************************************************************************* | |||||
| * Remainder rows: nothing left when M & 3 is zero; the two-row pass runs when M & 2 | |||||
| *********************************************************************************************/ | |||||
| sgemm_tcopy_L2_BEGIN: | |||||
| tst M, #3 | |||||
| ble sgemm_tcopy_L999 | |||||
| tst M, #2 | |||||
| ble sgemm_tcopy_L1_BEGIN | |||||
| sgemm_tcopy_L2_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 8 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble sgemm_tcopy_L2_M4_40 | |||||
| sgemm_tcopy_L2_M4_20: | |||||
| COPY4x2 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L2_M4_20 | |||||
| sgemm_tcopy_L2_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L2_M4_60 | |||||
| COPY2x2 | |||||
| sgemm_tcopy_L2_M4_60: | |||||
| tst N , #1 | |||||
| ble sgemm_tcopy_L2_M4_END | |||||
| COPY1x2 | |||||
| sgemm_tcopy_L2_M4_END: | |||||
| /********************************************************************************************* | |||||
| * Last single row, handled when M & 1 | |||||
| *********************************************************************************************/ | |||||
| sgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 | |||||
| ble sgemm_tcopy_L999 | |||||
| sgemm_tcopy_L1_M4_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #16 // B = B + 4 * SIZE | |||||
| str r3, B | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble sgemm_tcopy_L1_M4_40 | |||||
| sgemm_tcopy_L1_M4_20: | |||||
| COPY4x1 | |||||
| subs I , I , #1 | |||||
| bne sgemm_tcopy_L1_M4_20 | |||||
| sgemm_tcopy_L1_M4_40: | |||||
| tst N , #2 | |||||
| ble sgemm_tcopy_L1_M4_60 | |||||
| COPY2x1 | |||||
| sgemm_tcopy_L1_M4_60: | |||||
| tst N , #1 | |||||
| ble sgemm_tcopy_L1_M4_END | |||||
| COPY1x1 | |||||
| sgemm_tcopy_L1_M4_END: | |||||
| sgemm_tcopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { s8 - s15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,62 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/08/20 Saar | |||||
| * BLASTEST float OK | |||||
| * BLASTEST double OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <stdio.h> | |||||
| /* | |||||
|  * Generic swap kernel: exchanges n elements between x and y, stepping through | |||||
|  * x by inc_x and through y by inc_y. The dummy arguments are not used. | |||||
|  * Nothing is touched when n is zero or negative. Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
|     BLASLONG done; | |||||
|     BLASLONG xpos = 0; | |||||
|     BLASLONG ypos = 0; | |||||
|     /* a non-positive n simply yields zero iterations */ | |||||
|     for (done = 0; done < n; done++) | |||||
|     { | |||||
|         FLOAT saved = x[xpos]; | |||||
|         x[xpos] = y[ypos]; | |||||
|         y[ypos] = saved; | |||||
|         xpos += inc_x; | |||||
|         ypos += inc_y; | |||||
|     } | |||||
|     return 0; | |||||
| } | |||||
| @@ -0,0 +1,354 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/14 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| * Vector swap kernel. The same source is assembled in four variants selected by | |||||
| * the COMPLEX and DOUBLE preprocessor symbols. | |||||
| * | |||||
| * Register use: N is taken from r0 and X from r3 as they arrive. INC_X, Y and | |||||
| * INC_Y are loaded by the prologue from [fp, #0], [fp, #4] and [fp, #8] into r2, | |||||
| * r1 and r4. I (r12) is the loop counter. | |||||
| * X_PRE is the byte offset used by the pld prefetches. | |||||
| * STACKSIZE is defined here but is not referenced by the code of this routine. | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_INC_X [fp, #0 ] | |||||
| #define OLD_Y [fp, #4 ] | |||||
| #define OLD_INC_Y [fp, #8 ] | |||||
| #define N r0 | |||||
| #define Y r1 | |||||
| #define INC_X r2 | |||||
| #define X r3 | |||||
| #define INC_Y r4 | |||||
| #define I r12 | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| * | |||||
| * Every macro loads from X into the low registers (s0.. / d0..) and from Y into | |||||
| * s4.. / d4.., then stores the X values to Y and the Y values to X. | |||||
| * | |||||
| * KERNEL_F4 contiguous case; both pointers are advanced by the store writeback. | |||||
| * Real builds exchange four values. COMPLEX builds perform two such | |||||
| * four-value exchanges back to back. | |||||
| * KERNEL_F1 contiguous case, one element: one value in real builds, two values | |||||
| * in COMPLEX builds; pointers advanced by the store writeback. | |||||
| * KERNEL_S1 strided case, one element: stores use no writeback; X and Y are | |||||
| * advanced by INC_X and INC_Y, which the routine body has already | |||||
| * shifted into byte units before this macro is used. | |||||
| * | |||||
| * Prefetch: the real DOUBLE and both COMPLEX versions of KERNEL_F4 issue pld at | |||||
| * X_PRE for X and Y (COMPLEX DOUBLE before each half, COMPLEX single before the | |||||
| * first half only). The real single precision KERNEL_F4 contains no pld. | |||||
| **************************************************************************************/ | |||||
| /*****************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad X, { d0 - d3 } | |||||
| fldmiad Y, { d4 - d7 } | |||||
| fstmiad Y!, { d0 - d3 } | |||||
| fstmiad X!, { d4 - d7} | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X, { d0 } | |||||
| fldmiad Y, { d4 } | |||||
| fstmiad Y!, { d0 } | |||||
| fstmiad X!, { d4 } | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d0 } | |||||
| fldmiad Y, { d4 } | |||||
| fstmiad Y, { d0 } | |||||
| fstmiad X, { d4 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| .macro KERNEL_F4 | |||||
| fldmias X, { s0 - s3 } | |||||
| fldmias Y, { s4 - s7 } | |||||
| fstmias Y!, { s0 - s3 } | |||||
| fstmias X!, { s4 - s7} | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| fldmias X, { s0 } | |||||
| fldmias Y, { s4 } | |||||
| fstmias Y!, { s0 } | |||||
| fstmias X!, { s4 } | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s0 } | |||||
| fldmias Y, { s4 } | |||||
| fstmias Y, { s0 } | |||||
| fstmias X, { s4 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad X, { d0 - d3 } | |||||
| fldmiad Y, { d4 - d7 } | |||||
| fstmiad Y!, { d0 - d3 } | |||||
| fstmiad X!, { d4 - d7} | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmiad X, { d0 - d3 } | |||||
| fldmiad Y, { d4 - d7 } | |||||
| fstmiad Y!, { d0 - d3 } | |||||
| fstmiad X!, { d4 - d7} | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| fldmiad X, { d0 - d1 } | |||||
| fldmiad Y, { d4 - d5 } | |||||
| fstmiad Y!, { d0 - d1 } | |||||
| fstmiad X!, { d4 - d5 } | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| fldmiad X, { d0 - d1 } | |||||
| fldmiad Y, { d4 - d5 } | |||||
| fstmiad Y, { d0 - d1 } | |||||
| fstmiad X, { d4 - d5 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #else | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| fldmias X, { s0 - s3 } | |||||
| fldmias Y, { s4 - s7 } | |||||
| fstmias Y!, { s0 - s3 } | |||||
| fstmias X!, { s4 - s7} | |||||
| fldmias X, { s0 - s3 } | |||||
| fldmias Y, { s4 - s7 } | |||||
| fstmias Y!, { s0 - s3 } | |||||
| fstmias X!, { s4 - s7} | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| fldmias X, { s0 - s1 } | |||||
| fldmias Y, { s4 - s5 } | |||||
| fstmias Y!, { s0 - s1 } | |||||
| fstmias X!, { s4 - s5 } | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| fldmias X, { s0 - s1 } | |||||
| fldmias Y, { s4 - s5 } | |||||
| fstmias Y, { s0 - s1 } | |||||
| fstmias X, { s4 - s5 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| * | |||||
| * Routine body. | |||||
| * | |||||
| * Early exit (r0 = 0) when N <= 0, INC_X == 0 or INC_Y == 0. | |||||
| * | |||||
| * swap_kernel_F_BEGIN: taken when INC_X == 1 and INC_Y == 1. Runs I = N / 4 | |||||
| * iterations of KERNEL_F4 (the loop body holds two of them, each followed by its | |||||
| * own decrement and test), then N & 3 iterations of KERNEL_F1. For the real single | |||||
| * precision build the pld pair is issued here at the top of the loop because that | |||||
| * KERNEL_F4 variant carries none. | |||||
| * | |||||
| * swap_kernel_S_BEGIN: any other increments. INC_X and INC_Y are first shifted | |||||
| * left by 2, 3 or 4 depending on COMPLEX / DOUBLE so that they are byte steps, | |||||
| * then I = N / 4 iterations of four KERNEL_S1 run, followed by N & 3 single | |||||
| * KERNEL_S1 iterations. | |||||
| * | |||||
| * swap_kernel_L999: sets r0 to 0, restores sp, r4 and fp, and returns. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| push {r4 , fp} | |||||
| add fp, sp, #8 | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr Y, OLD_Y | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp N, #0 | |||||
| ble swap_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq swap_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq swap_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne swap_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne swap_kernel_S_BEGIN | |||||
| swap_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble swap_kernel_F1 | |||||
| .align 5 | |||||
| swap_kernel_F4: | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ Y, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble swap_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_F4 | |||||
| swap_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble swap_kernel_L999 | |||||
| swap_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_F10 | |||||
| b swap_kernel_L999 | |||||
| swap_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #2 // INC_Y * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble swap_kernel_S1 | |||||
| .align 5 | |||||
| swap_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_S4 | |||||
| swap_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble swap_kernel_L999 | |||||
| swap_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_S10 | |||||
| swap_kernel_L999: | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #8 | |||||
| pop {r4,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,81 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| /* |re| + |im| of the complex entry stored at x[i], x[i+1]. Fully parenthesized | |||||
|    so the expansion stays one operand inside any larger expression. */ | |||||
| #define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1])) | |||||
| /* | |||||
|  * Returns the largest |re| + |im| found among the n complex entries of x. | |||||
|  * | |||||
|  * n      number of complex entries to examine | |||||
|  * x      interleaved (re, im) storage | |||||
|  * inc_x  distance between consecutive entries, counted in complex elements | |||||
|  * | |||||
|  * Returns 0.0 when n <= 0 or inc_x < 1; x is not read in that case. | |||||
|  */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
|     BLASLONG i; | |||||
|     BLASLONG ix; | |||||
|     BLASLONG inc_x2; | |||||
|     FLOAT maxf; | |||||
|     /* n == 0 has to leave here: the seeding below reads x[0] and x[1], | |||||
|        which do not exist for an empty vector. */ | |||||
|     if (n <= 0 || inc_x < 1 ) return(0.0); | |||||
|     inc_x2 = 2 * inc_x; | |||||
|     /* seed with the first entry, then scan the remaining n - 1 */ | |||||
|     maxf = CABS1(x,0); | |||||
|     ix = inc_x2; | |||||
|     for (i = 1; i < n; i++) | |||||
|     { | |||||
|         FLOAT cur = CABS1(x,ix); | |||||
|         if( cur > maxf ) | |||||
|         { | |||||
|             maxf = cur; | |||||
|         } | |||||
|         ix += inc_x2; | |||||
|     } | |||||
|     return(maxf); | |||||
| } | |||||
| @@ -0,0 +1,81 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : NoTest | |||||
| * TEST : NoTest | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| /* |re| + |im| of the complex entry stored at x[i], x[i+1]. Fully parenthesized | |||||
|    so the expansion stays one operand inside any larger expression. */ | |||||
| #define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1])) | |||||
| /* | |||||
|  * Returns the smallest |re| + |im| found among the n complex entries of x. | |||||
|  * | |||||
|  * n      number of complex entries to examine | |||||
|  * x      interleaved (re, im) storage | |||||
|  * inc_x  distance between consecutive entries, counted in complex elements | |||||
|  * | |||||
|  * Returns 0.0 when n <= 0 or inc_x < 1; x is not read in that case. | |||||
|  */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
|     BLASLONG i; | |||||
|     BLASLONG ix; | |||||
|     BLASLONG inc_x2; | |||||
|     FLOAT minf; | |||||
|     /* n == 0 has to leave here: the seeding below reads x[0] and x[1], | |||||
|        which do not exist for an empty vector. */ | |||||
|     if (n <= 0 || inc_x < 1 ) return(0.0); | |||||
|     inc_x2 = 2 * inc_x; | |||||
|     /* seed with the first entry, then scan the remaining n - 1 */ | |||||
|     minf = CABS1(x,0); | |||||
|     ix = inc_x2; | |||||
|     for (i = 1; i < n; i++) | |||||
|     { | |||||
|         FLOAT cur = CABS1(x,ix); | |||||
|         if( cur < minf ) | |||||
|         { | |||||
|             minf = cur; | |||||
|         } | |||||
|         ix += inc_x2; | |||||
|     } | |||||
|     return(minf); | |||||
| } | |||||
| @@ -0,0 +1,71 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #ifndef DOUBLE | |||||
| #define ABS fabsf | |||||
| #else | |||||
| #define ABS fabs | |||||
| #endif | |||||
| /* |re| + |im| of the complex entry held at v[k], v[k+1] */ | |||||
| #define CABS1(v,k) ABS(v[k])+ABS(v[k+1]) | |||||
| /* | |||||
|  * Sum of |re| + |im| over n complex entries of x, inc_x complex elements apart. | |||||
|  * Yields 0.0 when n is negative or inc_x is below 1. | |||||
|  */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
|     FLOAT total = 0.0; | |||||
|     if (n >= 0 && inc_x >= 1) | |||||
|     { | |||||
|         const BLASLONG step = 2 * inc_x;   /* distance in FLOAT units */ | |||||
|         const BLASLONG limit = n * step;   /* one past the last entry's offset */ | |||||
|         BLASLONG pos; | |||||
|         for (pos = 0; pos < limit; pos += step) | |||||
|         { | |||||
|             total += CABS1(x,pos); | |||||
|         } | |||||
|     } | |||||
|     return total; | |||||
| } | |||||
| @@ -0,0 +1,72 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/15 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix,iy; | |||||
| if ( n < 0 ) return(0); | |||||
| if ( da_r == 0.0 && da_i == 0.0 ) return(0); | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| while(i < n) | |||||
| { | |||||
| #if !defined(CONJ) | |||||
| y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||||
| y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||||
| #else | |||||
| y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||||
| y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||||
| #endif | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,63 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| if ( n < 0 ) return(0); | |||||
| BLASLONG inc_x2 = 2 * inc_x; | |||||
| BLASLONG inc_y2 = 2 * inc_y; | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[ix] ; | |||||
| y[iy+1] = x[ix+1] ; | |||||
| ix += inc_x2; | |||||
| iy += inc_y2; | |||||
| i++ ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,223 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Double-precision complex vector copy for ARM VFP.
 * Copies N elements of 16 bytes (two doubles) each from X to Y and returns 0
 * in r0.  A contiguous fast path is taken when both increments equal 1;
 * otherwise a strided path advances X and Y by INC_X * 16 and INC_Y * 16
 * bytes per element.
 */
/* Bytes of scratch stack reserved below the pushed core registers. */
| #define STACKSIZE 256 | |||||
/* Register arguments as consumed below: count, source, source increment, destination. */
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define OLD_Y r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
/*
 * With fp = (entry sp) - 4 after the prologue, [fp, #4] is the word that was
 * on top of the stack at entry; it is loaded into INC_Y.
 */
| #define OLD_INC_Y [fp, #4 ] | |||||
/* Working registers: loop counter, destination pointer, destination increment. */
| #define I r5 | |||||
| #define Y r6 | |||||
| #define INC_Y r7 | |||||
/* Byte offset ahead of X that COPY_F4 prefetches with pld. */
| #define X_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
/* Contiguous copy of 4 elements: loads d0-d7 (64 bytes) from X and stores them to Y, post-incrementing both. */
| .macro COPY_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| pld [ X, #X_PRE+32 ] | |||||
| fldmiad X!, { d0 - d7 } | |||||
| fstmiad Y!, { d0 - d7 } | |||||
| .endm | |||||
/* Contiguous copy of 1 element: two doubles (16 bytes), post-incrementing X and Y. */
| .macro COPY_F1 | |||||
| fldmiad X!, { d0 - d1 } | |||||
| fstmiad Y!, { d0 - d1 } | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
/*
 * Strided copy of 4 elements: each step moves two doubles without writeback,
 * then adds the byte strides INC_X / INC_Y to X / Y.
 * NOTE(review): the purpose of the leading nop is not evident from this file.
 */
| .macro COPY_S4 | |||||
| nop | |||||
| fldmiad X, { d0 - d1 } | |||||
| fstmiad Y, { d0 - d1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d2 - d3 } | |||||
| fstmiad Y, { d2 - d3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d0 - d1 } | |||||
| fstmiad Y, { d0 - d1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| fldmiad X, { d2 - d3 } | |||||
| fstmiad Y, { d2 - d3 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
/* Strided copy of 1 element, then advance X and Y by their byte strides. */
| .macro COPY_S1 | |||||
| fldmiad X, { d0 - d1 } | |||||
| fstmiad Y, { d0 - d1 } | |||||
| add X, X, INC_X | |||||
| add Y, Y, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
/* Save r4-r9 and fp (7 words); fp then points at the saved fp slot. */
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
/* d8-d15 are saved here although the copy macros above only use d0-d7. */
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| mov Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
/* Nothing to copy when N <= 0 or either increment is 0. */
| cmp N, #0 | |||||
| ble zcopy_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq zcopy_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq zcopy_kernel_L999 | |||||
/* Take the strided path unless both increments are exactly 1. */
| cmp INC_X, #1 | |||||
| bne zcopy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne zcopy_kernel_S_BEGIN | |||||
/* Contiguous path: blocks of 4 elements first, then the N & 3 leftovers. */
| zcopy_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble zcopy_kernel_F1 | |||||
| zcopy_kernel_F4: | |||||
| COPY_F4 | |||||
| subs I, I, #1 | |||||
| bne zcopy_kernel_F4 | |||||
| zcopy_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble zcopy_kernel_L999 | |||||
| zcopy_kernel_F10: | |||||
| COPY_F1 | |||||
| subs I, I, #1 | |||||
| bne zcopy_kernel_F10 | |||||
| b zcopy_kernel_L999 | |||||
/* Strided path: convert increments from elements to bytes (x16), then same 4 + remainder split. */
| zcopy_kernel_S_BEGIN: | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble zcopy_kernel_S1 | |||||
| zcopy_kernel_S4: | |||||
| COPY_S4 | |||||
| subs I, I, #1 | |||||
| bne zcopy_kernel_S4 | |||||
| zcopy_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble zcopy_kernel_L999 | |||||
| zcopy_kernel_S10: | |||||
| COPY_S1 | |||||
| subs I, I, #1 | |||||
| bne zcopy_kernel_S10 | |||||
/* Common exit: restore d8-d15, return 0, unwind the frame set up in the prologue. */
| zcopy_kernel_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,78 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : FAIL | |||||
| * BLASTEST double : FAIL | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <complex.h> | |||||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT dot[2]; | |||||
| FLOAT _Complex result; | |||||
| dot[0]=0.0; | |||||
| dot[1]=0.0; | |||||
| __real__ result = 0.0 ; | |||||
| __imag__ result = 0.0 ; | |||||
| if ( n < 1 ) return(result); | |||||
| BLASLONG inc_x2 = 2 * inc_x ; | |||||
| BLASLONG inc_y2 = 2 * inc_y ; | |||||
| while(i < n) | |||||
| { | |||||
| #if !defined(CONJ) | |||||
| dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; | |||||
| dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; | |||||
| #else | |||||
| dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; | |||||
| dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; | |||||
| #endif | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| __real__ result = dot[0]; | |||||
| __imag__ result = dot[1]; | |||||
| return(result); | |||||
| } | |||||
| @@ -0,0 +1,286 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/11 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Double-precision complex dot product for ARM VFP.
 *
 * Accumulates four partial sums over N elements (two doubles each):
 *   d0 = sum(x_re * y_re)   d1 = sum(x_re * y_im)
 *   d2 = sum(x_im * y_im)   d3 = sum(x_im * y_re)
 * and combines them at exit:
 *   !CONJ: d0 = d0 - d2, d1 = d1 + d3
 *    CONJ: d0 = d0 + d2, d1 = d1 - d3
 * The result is left in d0 (real part) and d1 (imaginary part).
 */

/* Bytes of scratch stack reserved below the pushed core registers. */
#define STACKSIZE 256

/* Register arguments as consumed below: count, x, x increment, y. */
#define	N	r0
#define	X	r1
#define	INC_X	r2
#define	OLD_Y	r3

/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/*
 * With fp = (entry sp) - 4 after the prologue, [fp, #4] is the word that was
 * on top of the stack at entry; it is loaded into INC_Y.
 */
#define OLD_INC_Y	[fp, #4 ]

/* Working registers: loop counter, y pointer, y increment. */
#define I	r5
#define Y	r6
#define INC_Y	r7

/* Byte offset ahead of X / Y that KERNEL_F4 prefetches with pld. */
#define X_PRE	512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* Contiguous path, 4 elements per expansion; loads are interleaved with the multiply-accumulates. */
.macro KERNEL_F4

	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]

	fldmiad	X!, { d4 - d5 }
	fldmiad	Y!, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fldmiad	X!, { d6 - d7 }
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8

	fldmiad	Y!, { d10 - d11 }
	fmacd	d0 , d6, d10
	fmacd	d1 , d6, d11
	pld	[ X, #X_PRE ]
	fmacd	d2 , d7, d11
	fmacd	d3 , d7, d10

	pld	[ Y, #X_PRE ]

	fldmiad	X!, { d4 - d5 }
	fldmiad	Y!, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fldmiad	X!, { d6 - d7 }
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8

	fldmiad	Y!, { d10 - d11 }
	fmacd	d0 , d6, d10
	fmacd	d1 , d6, d11
	fmacd	d2 , d7, d11
	fmacd	d3 , d7, d10

.endm

/* Contiguous path, 1 element. */
.macro KERNEL_F1

	fldmiad	X!, { d4 - d5 }
	fldmiad	Y!, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8

.endm

/*************************************************************************************************************************/

/*
 * Strided path, 4 elements: load without writeback, accumulate, then add the
 * byte strides INC_X / INC_Y to X / Y.
 * NOTE(review): the purpose of the leading nop is not evident from this file.
 */
.macro KERNEL_S4

	nop

	fldmiad	X, { d4 - d5 }
	fldmiad	Y, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8
	add	X, X, INC_X
	add	Y, Y, INC_Y

	fldmiad	X, { d4 - d5 }
	fldmiad	Y, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8
	add	X, X, INC_X
	add	Y, Y, INC_Y

	fldmiad	X, { d4 - d5 }
	fldmiad	Y, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8
	add	X, X, INC_X
	add	Y, Y, INC_Y

	fldmiad	X, { d4 - d5 }
	fldmiad	Y, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm

/* Strided path, 1 element. */
.macro KERNEL_S1

	fldmiad	X, { d4 - d5 }
	fldmiad	Y, { d8 - d9 }
	fmacd	d0 , d4, d8
	fmacd	d1 , d4, d9
	fmacd	d2 , d5, d9
	fmacd	d3 , d5, d8
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5

	/* Save r4-r9 and fp (7 words); fp then points at the saved fp slot. */
	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

	sub	r4, fp, #128
	vstm	r4, { d8 - d15}			// store floating point registers

	mov	Y, OLD_Y
	ldr	INC_Y, OLD_INC_Y

	/*
	 * Clear the four accumulators with a real zero.  The previous
	 * "vsub.f64 dN, dN, dN" idiom produces NaN whenever the register
	 * happened to hold NaN or +/-Inf on entry, which then poisons the
	 * whole dot product.  r4 is free again here (it only held the
	 * vstm address above).
	 */
	mov	r4, #0
	vmov	s0, r4				// s0 = +0.0f
	vcvt.f64.f32	d0, s0			// d0 = +0.0 (s0 stays zero: it is the low half of d0)
	vcvt.f64.f32	d1, s0
	vcvt.f64.f32	d2, s0
	vcvt.f64.f32	d3, s0

	/* With N <= 0 or a zero increment, skip straight to the exit (result 0 + 0i). */
	cmp	N, #0
	ble	zdot_kernel_L999

	cmp	INC_X, #0
	beq	zdot_kernel_L999

	cmp	INC_Y, #0
	beq	zdot_kernel_L999

	/* Take the strided path unless both increments are exactly 1. */
	cmp	INC_X, #1
	bne	zdot_kernel_S_BEGIN

	cmp	INC_Y, #1
	bne	zdot_kernel_S_BEGIN

	/* Contiguous path: blocks of 4 elements, then the N & 3 leftovers. */
zdot_kernel_F_BEGIN:

	asrs	I, N, #2			// I = N / 4
	ble	zdot_kernel_F1

zdot_kernel_F4:

	KERNEL_F4

	subs	I, I, #1
	bne	zdot_kernel_F4

zdot_kernel_F1:

	ands	I, N, #3
	ble	zdot_kernel_L999

zdot_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	zdot_kernel_F10

	b	zdot_kernel_L999

	/* Strided path: convert increments from elements to bytes (x16). */
zdot_kernel_S_BEGIN:

	lsl	INC_X, INC_X, #4		// INC_X * SIZE * 2
	lsl	INC_Y, INC_Y, #4		// INC_Y * SIZE * 2

	asrs	I, N, #2			// I = N / 4
	ble	zdot_kernel_S1

zdot_kernel_S4:

	KERNEL_S4

	subs	I, I, #1
	bne	zdot_kernel_S4

zdot_kernel_S1:

	ands	I, N, #3
	ble	zdot_kernel_L999

zdot_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	zdot_kernel_S10

	/* Common exit: restore d8-d15, fold the partial sums into d0/d1, unwind the frame. */
zdot_kernel_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}			// restore floating point registers

#if !defined(CONJ)
	vsub.f64	d0 , d0, d2
	vadd.f64	d1 , d1, d3
#else
	vadd.f64	d0 , d0, d2
	vsub.f64	d1 , d1, d3
#endif

	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
| @@ -0,0 +1,254 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/05 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Packing routine for double-precision complex GEMM, ARM VFP.
 *
 * Walks the source matrix A two "LDA-strided" lines at a time (AO1 = A,
 * AO2 = A + LDA) and writes their 16-byte elements interleaved to the
 * contiguous buffer B: AO1[0], AO2[0], AO1[1], AO2[1], ...  A final single
 * line (odd N) is copied straight through.  Returns 0 in r0.
 *
 * All loop-entry branches use an explicit compare (or beq after a bit test)
 * instead of "asrs/ands/tst + ble".  Those instructions do not write the V
 * flag, the LE condition reads it, and the flags are undefined at function
 * entry, so the old form could skip the copy loops when V happened to be set.
 */

/* Bytes of scratch stack reserved below the pushed core registers. */
#define STACKSIZE 256

/* Register arguments on entry. */
#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_A	r2
#define	OLD_LDA	r3

/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* Stack slot (inside the reserved area) holding lda in bytes. */
#define LDA	[fp, #-260 ]

/*
 * With fp = (entry sp) - 4 after the prologue, [fp, #4] is the word that was
 * on top of the stack at entry; it is loaded into BO as the output pointer.
 */
#define B	[fp, #4 ]

#define	M	r0
#define	N	r1
#define	A	r2

/* Working registers: output pointer, the two source line pointers, loop counters. */
#define	BO	r5

#define	AO1	r6
#define	AO2	r7

#define I	r3
#define	J	r12

/* Byte offset ahead of AO1 / AO2 that COPY2x2 prefetches with pld. */
#define A_PRE	256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/* Two elements from each of AO1 and AO2, stored interleaved (64 bytes) to BO. */
.macro COPY2x2

	pld	[ AO1, #A_PRE ]
	pld	[ AO2, #A_PRE ]
	fldd	d0 , [ AO1, #0 ]
	fldd	d1 , [ AO1, #8 ]
	fldd	d4 , [ AO1, #16 ]
	fldd	d5 , [ AO1, #24 ]

	fldd	d2 , [ AO2, #0 ]
	fldd	d3 , [ AO2, #8 ]
	add	AO1, AO1, #32
	fldd	d6 , [ AO2, #16 ]
	fldd	d7 , [ AO2, #24 ]

	fstmiad	BO!, { d0 - d7 }
	add	AO2, AO2, #32

.endm

/* One element from each of AO1 and AO2 (32 bytes) to BO. */
.macro COPY1x2

	fldd	d0 , [ AO1, #0 ]
	fldd	d1 , [ AO1, #8 ]
	fldd	d2 , [ AO2, #0 ]
	fldd	d3 , [ AO2, #8 ]

	add	AO1, AO1, #16
	fstmiad	BO!, { d0 - d3 }
	add	AO2, AO2, #16

.endm

/* Two consecutive elements from AO1 only (32 bytes) to BO. */
.macro COPY2x1

	fldd	d0 , [ AO1, #0 ]
	fldd	d1 , [ AO1, #8 ]
	fldd	d2 , [ AO1, #16 ]
	fldd	d3 , [ AO1, #24 ]

	fstmiad	BO!, { d0 - d3 }
	add	AO1, AO1, #32

.endm

/* One element from AO1 only (16 bytes) to BO. */
.macro COPY1x1

	fldd	d0 , [ AO1, #0 ]
	fldd	d1 , [ AO1, #8 ]

	fstmiad	BO!, { d0 - d1 }
	add	AO1, AO1, #16

.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5

	/* Save r4-r9 and fp (7 words); fp then points at the saved fp slot. */
	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

	lsl	r3, r3, #4			// lda = lda * 8 * 2
	str	r3, LDA

	sub	r4, fp, #128
	vstm	r4, { d8 - d15}			// store floating point registers

	ldr	BO, B

/*********************************************************************************************/

zgemm_ncopy_L2_BEGIN:

	asr	J, N, #1			// J = N / 2
	cmp	J, #0				// explicit compare: defines all flags for ble
	ble	zgemm_ncopy_L1_BEGIN

zgemm_ncopy_L2_M2_BEGIN:

	mov	AO1, A				// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	A , AO2, r4			// A = A + 2 * LDA

	asr	I, M, #1			// I = M / 2
	cmp	I, #0
	ble	zgemm_ncopy_L2_M2_40

zgemm_ncopy_L2_M2_20:

	COPY2x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_20

zgemm_ncopy_L2_M2_40:

	ands	I, M , #1			// result is 0 or 1, so Z alone decides
	beq	zgemm_ncopy_L2_M2_END

zgemm_ncopy_L2_M2_60:

	COPY1x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_60

zgemm_ncopy_L2_M2_END:

	subs	J , J, #1			// j--
	bne	zgemm_ncopy_L2_M2_BEGIN

/*********************************************************************************************/

zgemm_ncopy_L1_BEGIN:

	tst	N, #1				// odd N: one trailing line left
	beq	zgemm_ncopy_L999

zgemm_ncopy_L1_M2_BEGIN:

	mov	AO1, A				// AO1 = A
	ldr	r4 , LDA
	add	A , AO1, r4			// A = A + 1 * LDA

	asr	I, M, #1			// I = M / 2
	cmp	I, #0
	ble	zgemm_ncopy_L1_M2_40

zgemm_ncopy_L1_M2_20:

	COPY2x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_20

zgemm_ncopy_L1_M2_40:

	ands	I, M , #1
	beq	zgemm_ncopy_L1_M2_END

zgemm_ncopy_L1_M2_60:

	COPY1x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_60

zgemm_ncopy_L1_M2_END:

	/* Common exit: restore d8-d15, return 0, unwind the frame. */
zgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}			// restore floating point registers

	movs	r0, #0				// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
| @@ -0,0 +1,245 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/07 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define OLD_M r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_A r2 | |||||
| #define OLD_LDA r3 | |||||
| /****************************************************** | |||||
| * [fp, #-128] - [fp, #-64] is reserved | |||||
| * for store and restore of floating point | |||||
| * registers | |||||
| *******************************************************/ | |||||
| #define B [fp, #4 ] | |||||
| #define A [fp, #-248 ] | |||||
| #define M r0 | |||||
| #define N r1 | |||||
| #define M4 r2 | |||||
| #define LDA r5 | |||||
| #define AO1 r6 | |||||
| #define BO1 r7 | |||||
| #define BO2 r8 | |||||
| #define I r4 | |||||
| #define J r12 | |||||
| #define A_PRE 256 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| .macro COPY2x2 | |||||
| pld [ AO1, #A_PRE ] | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| add r3, AO1, LDA | |||||
| pld [ r3, #A_PRE ] | |||||
| fldmiad r3, { d4 - d7 } | |||||
| fstmiad BO1, { d0 - d7 } | |||||
| add AO1, AO1, #32 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY1x2 | |||||
| fldmiad AO1, { d0 -d1 } | |||||
| add r3, AO1, LDA | |||||
| fldmiad r3, { d2 - d3 } | |||||
| fstmiad BO2, { d0 - d3 } | |||||
| add AO1, AO1, #16 | |||||
| add BO2, BO2, #32 | |||||
| .endm | |||||
| /*************************************************************************************************************************/ | |||||
| .macro COPY2x1 | |||||
| fldmiad AO1, { d0 - d3 } | |||||
| fstmiad BO1, { d0 - d3 } | |||||
| add AO1, AO1, #32 | |||||
| add BO1, BO1, M4 | |||||
| .endm | |||||
| .macro COPY1x1 | |||||
| fldmiad AO1, { d0 - d1 } | |||||
| fstmiad BO2, { d0 - d1 } | |||||
| add AO1, AO1, #16 | |||||
| add BO2, BO2, #16 | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* | |||||
| * Copy routine body. For every pair of source lines (J = M / 2) it copies | |||||
| * N / 2 groups of two 16-byte elements per line into 64-byte tiles that lie | |||||
| * M4 bytes apart in the destination, and sends the odd trailing element of | |||||
| * each line (when N is odd) to the BO2 area, which starts at | |||||
| * B + (N rounded down to even) * M * 16 bytes. A final single line is | |||||
| * handled the same way when M is odd. Returns 0 in r0. | |||||
| * | |||||
| * Loop counts are tested with an explicit cmp and the parity tests branch | |||||
| * with beq: asrs and tst leave the V flag untouched, so a ble placed | |||||
| * directly after them would depend on whatever V value was live on entry | |||||
| * and could skip all of the copying. | |||||
| */ | |||||
| .align 5 | |||||
| push {r4 - r9, fp} | |||||
| add fp, sp, #24 // fp -> saved fp slot; first stack argument is at [fp, #4] | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| str OLD_A, A // store A | |||||
| lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2 | |||||
| sub r4, fp, #128 | |||||
| vstm r4, { d8 - d15} // store floating point registers | |||||
| lsl r4 , M, #4 // M * SIZE * 2 | |||||
| ldr r3, B | |||||
| and BO2 , N , #-2 // N rounded down to an even count | |||||
| mul BO2, BO2, r4 | |||||
| add BO2 , BO2, r3 // BO2 = B + (N & -2) * M * SIZE * 2 | |||||
| lsl M4, M, #5 // M4 = M * 2 * SIZE * 2 | |||||
| zgemm_tcopy_L2_BEGIN: | |||||
| asr J, M, #1 // J = M / 2 | |||||
| cmp J, #0 // explicit compare: gives ble a defined V flag | |||||
| ble zgemm_tcopy_L1_BEGIN | |||||
| zgemm_tcopy_L2_M2_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| lsl r3, LDA, #1 // r3 = 2 * LDA | |||||
| add r3, r3 , AO1 // A = A + 2 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #64 // B = B + 4 * SIZE *2 | |||||
| str r3, B | |||||
| asr I, N, #1 // I = N / 2 | |||||
| cmp I, #0 // explicit compare: gives ble a defined V flag | |||||
| ble zgemm_tcopy_L2_M2_60 | |||||
| zgemm_tcopy_L2_M2_40: | |||||
| COPY2x2 | |||||
| subs I, I, #1 | |||||
| bne zgemm_tcopy_L2_M2_40 | |||||
| zgemm_tcopy_L2_M2_60: | |||||
| tst N , #1 // odd trailing element in these two lines? | |||||
| beq zgemm_tcopy_L2_M2_END // no: N is even | |||||
| COPY1x2 | |||||
| zgemm_tcopy_L2_M2_END: | |||||
| subs J , J, #1 // j-- | |||||
| bne zgemm_tcopy_L2_M2_BEGIN | |||||
| /*********************************************************************************************/ | |||||
| zgemm_tcopy_L1_BEGIN: | |||||
| tst M, #1 // one source line left over? | |||||
| beq zgemm_tcopy_L999 // no: M is even | |||||
| zgemm_tcopy_L1_M2_BEGIN: | |||||
| ldr AO1, A // AO1 = A | |||||
| add r3, LDA , AO1 // A = A + 1 * LDA | |||||
| str r3, A // store A | |||||
| ldr BO1, B | |||||
| add r3, BO1, #32 // B = B + 2 * SIZE *2 | |||||
| str r3, B | |||||
| asr I, N, #1 // I = N / 2 | |||||
| cmp I, #0 // explicit compare: gives ble a defined V flag | |||||
| ble zgemm_tcopy_L1_M2_60 | |||||
| zgemm_tcopy_L1_M2_40: | |||||
| COPY2x1 | |||||
| subs I, I, #1 | |||||
| bne zgemm_tcopy_L1_M2_40 | |||||
| zgemm_tcopy_L1_M2_60: | |||||
| tst N , #1 // odd trailing element in the last line? | |||||
| beq zgemm_tcopy_L1_M2_END // no: N is even | |||||
| COPY1x1 | |||||
| zgemm_tcopy_L1_M2_END: | |||||
| zgemm_tcopy_L999: | |||||
| sub r3, fp, #128 | |||||
| vldm r3, { d8 - d15} // restore floating point registers | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #24 | |||||
| pop {r4 - r9, fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,157 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * * 2013/11/23 Saar | |||||
| * * BLASTEST float : OK | |||||
| * * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * * | |||||
| * **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
|     /* | |||||
|      * Reference complex matrix-vector kernel (non-transposed form). | |||||
|      * For every column j the scalar temp = alpha * x[j] is formed (sign | |||||
|      * pattern chosen by XCONJ) and temp * a[i][j] is accumulated into | |||||
|      * y[i] for all m rows (sign pattern chosen by CONJ and XCONJ). | |||||
|      * Complex values are stored as consecutive (real, imaginary) FLOAT | |||||
|      * pairs, so every offset below counts FLOATs. dummy1 and buffer are | |||||
|      * not used. Always returns 0. | |||||
|      */ | |||||
|     BLASLONG row, col;          /* loop counters over m and n */ | |||||
|     BLASLONG xpos, ypos, apos;  /* FLOAT offsets into x, y and the current column */ | |||||
|     BLASLONG x_step, y_step;    /* FLOAT strides for the non-unit path */ | |||||
|     BLASLONG col_step;          /* FLOAT distance between matrix columns */ | |||||
|     FLOAT *column;              /* start of the column being processed */ | |||||
|     FLOAT temp_r, temp_i; | |||||
|  | |||||
|     col_step = 2*lda; | |||||
|     column = a; | |||||
|     xpos = 0; | |||||
|  | |||||
|     if ( inc_x != 1 || inc_y != 1 ) | |||||
|     { | |||||
|         /* general strides */ | |||||
|         x_step = 2 * inc_x; | |||||
|         y_step = 2 * inc_y; | |||||
|         for (col = 0; col < n; col++) | |||||
|         { | |||||
| #if !defined(XCONJ) | |||||
|             temp_r = alpha_r * x[xpos] - alpha_i * x[xpos+1]; | |||||
|             temp_i = alpha_r * x[xpos+1] + alpha_i * x[xpos]; | |||||
| #else | |||||
|             temp_r = alpha_r * x[xpos] + alpha_i * x[xpos+1]; | |||||
|             temp_i = alpha_r * x[xpos+1] - alpha_i * x[xpos]; | |||||
| #endif | |||||
|             ypos = 0; | |||||
|             for (row = 0, apos = 0; row < m; row++, apos += 2) | |||||
|             { | |||||
| #if !defined(CONJ) && !defined(XCONJ) | |||||
|                 y[ypos] += temp_r * column[apos] - temp_i * column[apos+1]; | |||||
|                 y[ypos+1] += temp_r * column[apos+1] + temp_i * column[apos]; | |||||
| #elif !defined(CONJ) | |||||
|                 y[ypos] += temp_r * column[apos] + temp_i * column[apos+1]; | |||||
|                 y[ypos+1] += temp_r * column[apos+1] - temp_i * column[apos]; | |||||
| #elif !defined(XCONJ) | |||||
|                 y[ypos] += temp_r * column[apos] + temp_i * column[apos+1]; | |||||
|                 y[ypos+1] -= temp_r * column[apos+1] - temp_i * column[apos]; | |||||
| #else | |||||
|                 y[ypos] += temp_r * column[apos] - temp_i * column[apos+1]; | |||||
|                 y[ypos+1] -= temp_r * column[apos+1] + temp_i * column[apos]; | |||||
| #endif | |||||
|                 ypos += y_step; | |||||
|             } | |||||
|             column += col_step; | |||||
|             xpos += x_step; | |||||
|         } | |||||
|         return(0); | |||||
|     } | |||||
|  | |||||
|     /* unit strides: both vectors advance by one complex entry (two FLOATs) */ | |||||
|     for (col = 0; col < n; col++) | |||||
|     { | |||||
| #if !defined(XCONJ) | |||||
|         temp_r = alpha_r * x[xpos] - alpha_i * x[xpos+1]; | |||||
|         temp_i = alpha_r * x[xpos+1] + alpha_i * x[xpos]; | |||||
| #else | |||||
|         temp_r = alpha_r * x[xpos] + alpha_i * x[xpos+1]; | |||||
|         temp_i = alpha_r * x[xpos+1] - alpha_i * x[xpos]; | |||||
| #endif | |||||
|         ypos = 0; | |||||
|         for (row = 0, apos = 0; row < m; row++, apos += 2) | |||||
|         { | |||||
| #if !defined(CONJ) && !defined(XCONJ) | |||||
|             y[ypos] += temp_r * column[apos] - temp_i * column[apos+1]; | |||||
|             y[ypos+1] += temp_r * column[apos+1] + temp_i * column[apos]; | |||||
| #elif !defined(CONJ) | |||||
|             y[ypos] += temp_r * column[apos] + temp_i * column[apos+1]; | |||||
|             y[ypos+1] += temp_r * column[apos+1] - temp_i * column[apos]; | |||||
| #elif !defined(XCONJ) | |||||
|             y[ypos] += temp_r * column[apos] + temp_i * column[apos+1]; | |||||
|             y[ypos+1] -= temp_r * column[apos+1] - temp_i * column[apos]; | |||||
| #else | |||||
|             y[ypos] += temp_r * column[apos] - temp_i * column[apos+1]; | |||||
|             y[ypos+1] -= temp_r * column[apos+1] + temp_i * column[apos]; | |||||
| #endif | |||||
|             ypos += 2; | |||||
|         } | |||||
|         column += col_step; | |||||
|         xpos += 2; | |||||
|     } | |||||
|     return(0); | |||||
| } | |||||
| @@ -0,0 +1,699 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/29 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* bytes of local stack reserved below the saved registers */ | |||||
| #define OLD_LDA [fp, #0 ] /* incoming stack word at fp; loaded into LDA and scaled to bytes */ | |||||
| #define X [fp, #4 ] /* x pointer; reloaded into XO at the start of every row group */ | |||||
| #define OLD_INC_X [fp, #8 ] /* loaded into INC_X */ | |||||
| #define Y [fp, #12 ] /* y pointer; loaded into YO once per path */ | |||||
| #define OLD_INC_Y [fp, #16 ] /* loaded into INC_Y */ | |||||
| #define OLD_A r3 /* matrix pointer on entry; saved to the A slot, after which r3 is scratch */ | |||||
| #define OLD_M r0 /* row count on entry; saved to the M slot because r0 is reused as AO1 */ | |||||
| #define AO1 r0 /* running matrix pointer */ | |||||
| #define N r1 /* column count; bound of the inner loop */ | |||||
| #define J r2 /* inner loop counter */ | |||||
| #define AO2 r4 /* secondary matrix pointer; in this file it is only used as a pld address */ | |||||
| #define XO r5 /* running x pointer */ | |||||
| #define YO r6 /* running y pointer */ | |||||
| #define LDA r7 /* column stride in bytes */ | |||||
| #define INC_X r8 /* x increment; scaled to bytes on the strided path */ | |||||
| #define INC_Y r9 /* y increment; scaled to bytes on the strided path */ | |||||
| #define I r12 /* outer loop counter (row groups) */ | |||||
| #define ALPHA_I [fp, #-236] /* stack copy of d1 made in the prologue */ | |||||
| #define ALPHA_R [fp, #-244] /* stack copy of d0 made in the prologue */ | |||||
| #define M [fp, #-252 ] /* stack copy of OLD_M */ | |||||
| #define A [fp, #-256 ] /* stack copy of the matrix pointer; advanced after each row group */ | |||||
| #define X_PRE 64 /* pld offset applied to XO */ | |||||
| #define Y_PRE 0 /* pld offset applied to YO */ | |||||
| #define A_PRE 0 /* pld offset applied to AO2 */ | |||||
| /**************************************************************************************/ | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* each name below becomes fmacd (multiply-add) or fnmacd (multiply-subtract) per conjugation variant */ | |||||
| #define KMAC_R fnmacd /* kernel: second product applied to the real accumulator */ | |||||
| #define KMAC_I fmacd /* kernel: second product applied to the imaginary accumulator */ | |||||
| #define FMAC_R1 fmacd /* save: real result updated with d0 * real sum */ | |||||
| #define FMAC_R2 fnmacd /* save: real result updated with d1 * imaginary sum */ | |||||
| #define FMAC_I1 fmacd /* save: imaginary result updated with d0 * imaginary sum */ | |||||
| #define FMAC_I2 fmacd /* save: imaginary result updated with d1 * real sum */ | |||||
| #elif defined(CONJ) && !defined(XCONJ) /* same roles as above, signs for the CONJ-only variant */ | |||||
| #define KMAC_R fmacd | |||||
| #define KMAC_I fnmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fnmacd | |||||
| #define FMAC_I1 fmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #elif !defined(CONJ) && defined(XCONJ) /* signs for the XCONJ-only variant */ | |||||
| #define KMAC_R fmacd | |||||
| #define KMAC_I fnmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fmacd | |||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #else /* both CONJ and XCONJ defined */ | |||||
| #define KMAC_R fnmacd | |||||
| #define KMAC_I fmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fmacd | |||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #endif | |||||
| .macro INIT_F4 /* | |||||
| * Issue a prefetch hint for YO and set the eight accumulators d8-d15 to | |||||
| * +0.0. The zero is built from an all-zero bit pattern moved in from r3 | |||||
| * rather than by subtracting d8 from itself: x - x is NaN whenever the | |||||
| * register still holds NaN or an infinity (stale caller data or a | |||||
| * non-finite sum from the previous row group). | |||||
| * Clobbers r3, which is free scratch at every point this macro is used. | |||||
| */ | |||||
| pld [ YO, #Y_PRE ] | |||||
| mov r3 , #0 // plain mov: condition flags are left alone | |||||
| vmov d8 , r3 , r3 // d8 = +0.0 (both 32-bit halves zero) | |||||
| vmov.f64 d9 , d8 | |||||
| vmov.f64 d10, d8 | |||||
| vmov.f64 d11, d8 | |||||
| vmov.f64 d12, d8 | |||||
| vmov.f64 d13, d8 | |||||
| vmov.f64 d14, d8 | |||||
| vmov.f64 d15, d8 | |||||
| .endm | |||||
| .macro KERNEL_F4X4 /* | |||||
| * Four consecutive KERNEL_F4X1 steps with an x prefetch hint before the | |||||
| * first and before the third. | |||||
| */ | |||||
| pld [ XO, #X_PRE ] | |||||
| KERNEL_F4X1 | |||||
| KERNEL_F4X1 | |||||
| pld [ XO, #X_PRE ] | |||||
| KERNEL_F4X1 | |||||
| KERNEL_F4X1 | |||||
| .endm | |||||
| .macro KERNEL_F4X1 /* | |||||
| * One inner-loop step for four result entries, unit x stride. | |||||
| * Reads one (real, imaginary) pair from XO into d4/d5 and four pairs | |||||
| * (64 bytes) from AO1, and accumulates the four complex products into | |||||
| * the pairs d8/d9, d10/d11, d12/d13 and d14/d15; KMAC_R / KMAC_I pick | |||||
| * the sign of the terms built from the imaginary matrix parts. | |||||
| * Afterwards XO advances by 16 bytes and AO1 and AO2 by LDA bytes. | |||||
| * Loads and arithmetic are interleaved; d0-d3 are reused for the second | |||||
| * pair of matrix entries, so the statement order matters. | |||||
| */ | |||||
| fldd d0 , [ AO1 ] | |||||
| fldd d4 , [ XO ] | |||||
| fldd d5 , [ XO, #8 ] | |||||
| pld [ AO2, #A_PRE ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fmacd d8 , d0, d4 | |||||
| fldd d2 , [ AO1, #16 ] | |||||
| fmacd d9 , d0, d5 | |||||
| fldd d3 , [ AO1, #24 ] | |||||
| fmacd d10 , d2, d4 | |||||
| fldd d0 , [ AO1, #32 ] /* d0 is free again: start loading the third entry */ | |||||
| fmacd d11 , d2, d5 | |||||
| KMAC_R d8 , d1, d5 | |||||
| KMAC_I d9 , d1, d4 | |||||
| KMAC_R d10 , d3, d5 | |||||
| fldd d1 , [ AO1, #40 ] | |||||
| KMAC_I d11 , d3, d4 | |||||
| fldd d2 , [ AO1, #48 ] | |||||
| fmacd d12 , d0, d4 | |||||
| fldd d3 , [ AO1, #56 ] | |||||
| fmacd d13 , d0, d5 | |||||
| pld [ AO2, #A_PRE+32 ] | |||||
| fmacd d14 , d2, d4 | |||||
| fmacd d15 , d2, d5 | |||||
| KMAC_R d12 , d1, d5 | |||||
| add XO , XO, #16 | |||||
| KMAC_I d13 , d1, d4 | |||||
| add AO1 , AO1, LDA | |||||
| KMAC_R d14 , d3, d5 | |||||
| add AO2 , AO2, LDA | |||||
| KMAC_I d15 , d3, d4 | |||||
| .endm | |||||
| .macro SAVE_F4 /* | |||||
| * Add the four accumulated complex sums (d8-d15), scaled by the value | |||||
| * stored in ALPHA_R / ALPHA_I, into the four consecutive complex entries | |||||
| * at YO. FMAC_* pick the signs for the active conjugation variant. | |||||
| * Both stores use writeback, so YO ends 64 bytes further on. | |||||
| * Clobbers d0, d1 and d4-d7. | |||||
| */ | |||||
| fldd d0, ALPHA_R | |||||
| fldd d1, ALPHA_I | |||||
| fldmiad YO, { d4 - d7 } /* first two result entries */ | |||||
| FMAC_R1 d4 , d0 , d8 | |||||
| FMAC_I1 d5 , d0 , d9 | |||||
| FMAC_R2 d4 , d1 , d9 | |||||
| FMAC_I2 d5 , d1 , d8 | |||||
| FMAC_R1 d6 , d0 , d10 | |||||
| FMAC_I1 d7 , d0 , d11 | |||||
| FMAC_R2 d6 , d1 , d11 | |||||
| FMAC_I2 d7 , d1 , d10 | |||||
| fstmiad YO!, { d4 - d7 } | |||||
| fldmiad YO, { d4 - d7 } /* last two result entries */ | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| FMAC_R1 d6 , d0 , d14 | |||||
| FMAC_I1 d7 , d0 , d15 | |||||
| FMAC_R2 d6 , d1 , d15 | |||||
| FMAC_I2 d7 , d1 , d14 | |||||
| fstmiad YO!, { d4 - d7 } | |||||
| .endm | |||||
| .macro INIT_F1 /* | |||||
| * Set the accumulator pair d8/d9 to +0.0. The zero comes from an | |||||
| * all-zero bit pattern moved in from r3 instead of d8 - d8, because | |||||
| * x - x is NaN when the register still holds NaN or an infinity. | |||||
| * Clobbers r3, which is free scratch at every point this macro is used. | |||||
| */ | |||||
| mov r3 , #0 // plain mov: condition flags are left alone | |||||
| vmov d8 , r3 , r3 // d8 = +0.0 (both 32-bit halves zero) | |||||
| vmov.f64 d9 , d8 | |||||
| .endm | |||||
| .macro KERNEL_F1X1 /* | |||||
| * One inner-loop step for a single result entry, unit x stride: | |||||
| * multiplies the pair at AO1 (d0/d1) by the pair at XO (d4/d5) and | |||||
| * accumulates into d8/d9, then advances XO by 16 bytes and AO1 by LDA. | |||||
| */ | |||||
| fldd d0 , [ AO1 ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fldd d4 , [ XO ] | |||||
| fldd d5 , [ XO, #8 ] | |||||
| fmacd d8 , d0, d4 | |||||
| fmacd d9 , d0, d5 | |||||
| KMAC_R d8 , d1, d5 | |||||
| KMAC_I d9 , d1, d4 | |||||
| add XO , XO, #16 | |||||
| add AO1 , AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_F1 /* | |||||
| * Add the accumulated sum d8/d9, scaled by the value stored in | |||||
| * ALPHA_R / ALPHA_I, into the complex entry at YO and advance YO by | |||||
| * 16 bytes. Clobbers d0, d1, d4 and d5. | |||||
| */ | |||||
| fldd d0, ALPHA_R | |||||
| fldd d1, ALPHA_I | |||||
| fldmiad YO, { d4 - d5 } | |||||
| FMAC_R1 d4 , d0 , d8 | |||||
| FMAC_I1 d5 , d0 , d9 | |||||
| FMAC_R2 d4 , d1 , d9 | |||||
| FMAC_I2 d5 , d1 , d8 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, #16 | |||||
| .endm | |||||
| /****************************************************************************************/ | |||||
| .macro INIT_S4 /* | |||||
| * Set the eight accumulators d8-d15 to +0.0 (strided path). The zero is | |||||
| * built from an all-zero bit pattern moved in from r3 instead of | |||||
| * d8 - d8, because x - x is NaN when the register still holds NaN or an | |||||
| * infinity (stale caller data or a non-finite previous sum). | |||||
| * Clobbers r3, which is free scratch at every point this macro is used. | |||||
| */ | |||||
| mov r3 , #0 // plain mov: condition flags are left alone | |||||
| vmov d8 , r3 , r3 // d8 = +0.0 (both 32-bit halves zero) | |||||
| vmov.f64 d9 , d8 | |||||
| vmov.f64 d10, d8 | |||||
| vmov.f64 d11, d8 | |||||
| vmov.f64 d12, d8 | |||||
| vmov.f64 d13, d8 | |||||
| vmov.f64 d14, d8 | |||||
| vmov.f64 d15, d8 | |||||
| .endm | |||||
| .macro KERNEL_S4X4 /* four consecutive KERNEL_S4X1 steps */ | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| KERNEL_S4X1 | |||||
| .endm | |||||
| .macro KERNEL_S4X1 /* | |||||
| * One inner-loop step for four result entries on the strided path. | |||||
| * Same arithmetic as the unit-stride step: the pair at XO (d4/d5) times | |||||
| * four pairs from AO1 accumulates into d8/d9 ... d14/d15. Afterwards XO | |||||
| * advances by INC_X bytes and AO1 and AO2 by LDA bytes. AO2 is advanced | |||||
| * but never dereferenced in this macro. | |||||
| */ | |||||
| fldd d0 , [ AO1 ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fldd d2 , [ AO1, #16 ] | |||||
| fldd d3 , [ AO1, #24 ] | |||||
| fldd d4 , [ XO ] | |||||
| fldd d5 , [ XO, #8 ] | |||||
| fmacd d8 , d0, d4 | |||||
| fmacd d9 , d0, d5 | |||||
| fmacd d10 , d2, d4 | |||||
| fmacd d11 , d2, d5 | |||||
| KMAC_R d8 , d1, d5 | |||||
| KMAC_I d9 , d1, d4 | |||||
| KMAC_R d10 , d3, d5 | |||||
| KMAC_I d11 , d3, d4 | |||||
| fldd d0 , [ AO1, #32 ] /* d0-d3 are reused for the third and fourth entries */ | |||||
| fldd d1 , [ AO1, #40 ] | |||||
| fldd d2 , [ AO1, #48 ] | |||||
| fldd d3 , [ AO1, #56 ] | |||||
| fmacd d12 , d0, d4 | |||||
| fmacd d13 , d0, d5 | |||||
| fmacd d14 , d2, d4 | |||||
| fmacd d15 , d2, d5 | |||||
| KMAC_R d12 , d1, d5 | |||||
| KMAC_I d13 , d1, d4 | |||||
| KMAC_R d14 , d3, d5 | |||||
| KMAC_I d15 , d3, d4 | |||||
| add XO , XO, INC_X | |||||
| add AO1 , AO1, LDA | |||||
| add AO2 , AO2, LDA | |||||
| .endm | |||||
| .macro SAVE_S4 /* | |||||
| * Strided counterpart of the four-entry save: each of the four | |||||
| * accumulated sums (d8/d9, d10/d11, d12/d13, d14/d15) is scaled by the | |||||
| * value stored in ALPHA_R / ALPHA_I and added into the complex entry at | |||||
| * YO, and YO advances by INC_Y bytes after every entry. | |||||
| * Clobbers d0, d1 and d4-d7. | |||||
| */ | |||||
| fldd d0, ALPHA_R | |||||
| fldd d1, ALPHA_I | |||||
| fldmiad YO, { d4 - d5 } /* entry 0 */ | |||||
| FMAC_R1 d4 , d0 , d8 | |||||
| FMAC_I1 d5 , d0 , d9 | |||||
| FMAC_R2 d4 , d1 , d9 | |||||
| FMAC_I2 d5 , d1 , d8 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d6 - d7 } /* entry 1 */ | |||||
| FMAC_R1 d6 , d0 , d10 | |||||
| FMAC_I1 d7 , d0 , d11 | |||||
| FMAC_R2 d6 , d1 , d11 | |||||
| FMAC_I2 d7 , d1 , d10 | |||||
| fstmiad YO, { d6 - d7 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d4 - d5 } /* entry 2 */ | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, INC_Y | |||||
| fldmiad YO, { d6 - d7 } /* entry 3 */ | |||||
| FMAC_R1 d6 , d0 , d14 | |||||
| FMAC_I1 d7 , d0 , d15 | |||||
| FMAC_R2 d6 , d1 , d15 | |||||
| FMAC_I2 d7 , d1 , d14 | |||||
| fstmiad YO, { d6 - d7 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| .macro INIT_S1 /* | |||||
| * Set the accumulator pair d8/d9 to +0.0 (strided path). The zero comes | |||||
| * from an all-zero bit pattern moved in from r3 instead of d8 - d8, | |||||
| * because x - x is NaN when the register still holds NaN or an infinity. | |||||
| * Clobbers r3, which is free scratch at every point this macro is used. | |||||
| */ | |||||
| mov r3 , #0 // plain mov: condition flags are left alone | |||||
| vmov d8 , r3 , r3 // d8 = +0.0 (both 32-bit halves zero) | |||||
| vmov.f64 d9 , d8 | |||||
| .endm | |||||
| .macro KERNEL_S1X1 /* | |||||
| * One inner-loop step for a single result entry on the strided path: | |||||
| * the pair at AO1 (d0/d1) times the pair at XO (d4/d5) accumulates into | |||||
| * d8/d9, then XO advances by INC_X bytes and AO1 by LDA bytes. | |||||
| */ | |||||
| fldd d0 , [ AO1 ] | |||||
| fldd d1 , [ AO1, #8 ] | |||||
| fldd d4 , [ XO ] | |||||
| fldd d5 , [ XO, #8 ] | |||||
| fmacd d8 , d0, d4 | |||||
| fmacd d9 , d0, d5 | |||||
| KMAC_R d8 , d1, d5 | |||||
| KMAC_I d9 , d1, d4 | |||||
| add XO , XO, INC_X | |||||
| add AO1 , AO1, LDA | |||||
| .endm | |||||
| .macro SAVE_S1 /* | |||||
| * Add the accumulated sum d8/d9, scaled by the value stored in | |||||
| * ALPHA_R / ALPHA_I, into the complex entry at YO and advance YO by | |||||
| * INC_Y bytes. Clobbers d0, d1, d4 and d5. | |||||
| */ | |||||
| fldd d0, ALPHA_R | |||||
| fldd d1, ALPHA_I | |||||
| fldmiad YO, { d4 - d5 } | |||||
| FMAC_R1 d4 , d0 , d8 | |||||
| FMAC_I1 d5 , d0 , d9 | |||||
| FMAC_R2 d4 , d1 , d9 | |||||
| FMAC_I2 d5 , d1 , d8 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE /* | |||||
| * Kernel entry. Returns 0 in r0 without touching y when OLD_M or N is not | |||||
| * positive or when either increment is zero. Otherwise result entries are | |||||
| * produced four at a time (I = M / 4), then one at a time for the M % 4 | |||||
| * remainder. For each group the inner loop walks all N columns, moving | |||||
| * AO1 by LDA bytes and XO by one element per column, and the SAVE_* macro | |||||
| * adds the alpha-scaled sums into the vector at YO. The F labels handle | |||||
| * the case where both increments equal 1; the S labels handle every other | |||||
| * case with increments pre-scaled to bytes. | |||||
| * NOTE(review): when DOUBLE is not defined only s8-s15 are saved and | |||||
| * restored although the macros overwrite d8-d15 - confirm this file is | |||||
| * only ever assembled with DOUBLE defined. | |||||
| */ | |||||
| .align 5 | |||||
| push {r4 - r9 , fp} | |||||
| add fp, sp, #28 /* seven words were pushed, so fp equals the stack pointer value on entry */ | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
| sub r12, fp, #192 /* the FP register save area starts 192 bytes below fp */ | |||||
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
| cmp OLD_M, #0 | |||||
| ble zgemvn_kernel_L999 | |||||
| cmp N, #0 | |||||
| ble zgemvn_kernel_L999 | |||||
| str OLD_A, A /* r3 becomes scratch from here on */ | |||||
| str OLD_M, M /* r0 becomes AO1 from here on */ | |||||
| vstr d0 , ALPHA_R /* d0 and d1 are reused as temporaries by the kernels */ | |||||
| vstr d1 , ALPHA_I | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
| cmp INC_X, #0 | |||||
| beq zgemvn_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq zgemvn_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #4 // LDA * SIZE * 2 | |||||
| #else | |||||
| lsl LDA, LDA, #3 // LDA * SIZE * 2 | |||||
| #endif | |||||
| cmp INC_X, #1 | |||||
| bne zgemvn_kernel_S4_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne zgemvn_kernel_S4_BEGIN | |||||
| zgemvn_kernel_F4_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = M / 4 | |||||
| ble zgemvn_kernel_F1_BEGIN | |||||
| zgemvn_kernel_F4X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #64 /* next group starts four entries (64 bytes) further down the column */ | |||||
| str r3 , A | |||||
| add AO2, AO2, LDA | |||||
| add AO2, AO2, LDA /* AO2 = AO1 + 3 * LDA, used only as a prefetch address */ | |||||
| ldr XO , X | |||||
| INIT_F4 | |||||
| asrs J, N, #2 // J = N / 4 | |||||
| ble zgemvn_kernel_F4X1 | |||||
| zgemvn_kernel_F4X4_10: | |||||
| KERNEL_F4X4 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_F4X4_10 | |||||
| zgemvn_kernel_F4X1: | |||||
| ands J, N , #3 /* leftover columns after the unrolled loop */ | |||||
| ble zgemvn_kernel_F4_END | |||||
| zgemvn_kernel_F4X1_10: | |||||
| KERNEL_F4X1 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_F4X1_10 | |||||
| zgemvn_kernel_F4_END: | |||||
| SAVE_F4 | |||||
| subs I , I , #1 | |||||
| bne zgemvn_kernel_F4X4 | |||||
| zgemvn_kernel_F1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #3 /* leftover result entries, handled one at a time */ | |||||
| ble zgemvn_kernel_L999 | |||||
| zgemvn_kernel_F1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #16 /* next entry is 16 bytes further down the column */ | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| mov J, N | |||||
| zgemvn_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_F1X1_10 | |||||
| zgemvn_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| subs I , I , #1 | |||||
| bne zgemvn_kernel_F1X1 | |||||
| b zgemvn_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
| zgemvn_kernel_S4_BEGIN: | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr I, M | |||||
| asrs I, I, #2 // I = M / 4 | |||||
| ble zgemvn_kernel_S1_BEGIN | |||||
| zgemvn_kernel_S4X4: | |||||
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO1, #64 | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S4 | |||||
| asrs J, N, #2 // J = N / 4 | |||||
| ble zgemvn_kernel_S4X1 | |||||
| zgemvn_kernel_S4X4_10: | |||||
| KERNEL_S4X4 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_S4X4_10 | |||||
| zgemvn_kernel_S4X1: | |||||
| ands J, N , #3 | |||||
| ble zgemvn_kernel_S4_END | |||||
| zgemvn_kernel_S4X1_10: | |||||
| KERNEL_S4X1 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_S4X1_10 | |||||
| zgemvn_kernel_S4_END: | |||||
| SAVE_S4 | |||||
| subs I , I , #1 | |||||
| bne zgemvn_kernel_S4X4 | |||||
| zgemvn_kernel_S1_BEGIN: | |||||
| ldr I, M | |||||
| ands I, I , #3 | |||||
| ble zgemvn_kernel_L999 | |||||
| zgemvn_kernel_S1X1: | |||||
| ldr AO1, A | |||||
| add r3, AO1, #16 | |||||
| str r3, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| mov J, N | |||||
| zgemvn_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs J, J, #1 | |||||
| bne zgemvn_kernel_S1X1_10 | |||||
| zgemvn_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| subs I , I , #1 | |||||
| bne zgemvn_kernel_S1X1 | |||||
| /*************************************************************************************************************/ | |||||
| zgemvn_kernel_L999: | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
| sub sp, fp, #28 | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,140 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * * 2013/11/23 Saar | |||||
| * * BLASTEST float : OK | |||||
| * * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * * | |||||
| * **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
|     /* | |||||
|      * Reference complex matrix-vector kernel (transposed form). | |||||
|      * For every column j the complex dot product of the column with x is | |||||
|      * accumulated over m entries (sign pattern chosen by CONJ / XCONJ), | |||||
|      * then alpha times that sum is added into y[j] (sign pattern chosen | |||||
|      * by XCONJ). Complex values are consecutive (real, imaginary) FLOAT | |||||
|      * pairs, so every offset below counts FLOATs. dummy1 and buffer are | |||||
|      * not used. Always returns 0. | |||||
|      */ | |||||
|     BLASLONG row, col;          /* loop counters over m and n */ | |||||
|     BLASLONG xpos, ypos, apos;  /* FLOAT offsets into x, y and the current column */ | |||||
|     BLASLONG x_step, y_step;    /* FLOAT strides for the non-unit path */ | |||||
|     BLASLONG col_step;          /* FLOAT distance between matrix columns */ | |||||
|     FLOAT *column;              /* start of the column being processed */ | |||||
|     FLOAT temp_r, temp_i;       /* running dot product, real and imaginary part */ | |||||
|  | |||||
|     col_step = 2*lda; | |||||
|     column = a; | |||||
|     ypos = 0; | |||||
|  | |||||
|     if ( inc_x != 1 || inc_y != 1 ) | |||||
|     { | |||||
|         /* general strides */ | |||||
|         x_step = 2 * inc_x; | |||||
|         y_step = 2 * inc_y; | |||||
|         for (col = 0; col < n; col++) | |||||
|         { | |||||
|             temp_r = 0.0; | |||||
|             temp_i = 0.0; | |||||
|             xpos = 0; | |||||
|             for (row = 0, apos = 0; row < m; row++, apos += 2) | |||||
|             { | |||||
| #if defined(CONJ) == defined(XCONJ) | |||||
|                 temp_r += column[apos] * x[xpos] - column[apos+1] * x[xpos+1]; | |||||
|                 temp_i += column[apos] * x[xpos+1] + column[apos+1] * x[xpos]; | |||||
| #else | |||||
|                 temp_r += column[apos] * x[xpos] + column[apos+1] * x[xpos+1]; | |||||
|                 temp_i += column[apos] * x[xpos+1] - column[apos+1] * x[xpos]; | |||||
| #endif | |||||
|                 xpos += x_step; | |||||
|             } | |||||
| #if !defined(XCONJ) | |||||
|             y[ypos] += alpha_r * temp_r - alpha_i * temp_i; | |||||
|             y[ypos+1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
|             y[ypos] += alpha_r * temp_r + alpha_i * temp_i; | |||||
|             y[ypos+1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
|             column += col_step; | |||||
|             ypos += y_step; | |||||
|         } | |||||
|         return(0); | |||||
|     } | |||||
|  | |||||
|     /* unit strides: both vectors advance by one complex entry (two FLOATs) */ | |||||
|     for (col = 0; col < n; col++) | |||||
|     { | |||||
|         temp_r = 0.0; | |||||
|         temp_i = 0.0; | |||||
|         xpos = 0; | |||||
|         for (row = 0, apos = 0; row < m; row++, apos += 2) | |||||
|         { | |||||
| #if defined(CONJ) == defined(XCONJ) | |||||
|             temp_r += column[apos] * x[xpos] - column[apos+1] * x[xpos+1]; | |||||
|             temp_i += column[apos] * x[xpos+1] + column[apos+1] * x[xpos]; | |||||
| #else | |||||
|             temp_r += column[apos] * x[xpos] + column[apos+1] * x[xpos+1]; | |||||
|             temp_i += column[apos] * x[xpos+1] - column[apos+1] * x[xpos]; | |||||
| #endif | |||||
|             xpos += 2; | |||||
|         } | |||||
| #if !defined(XCONJ) | |||||
|         y[ypos] += alpha_r * temp_r - alpha_i * temp_i; | |||||
|         y[ypos+1] += alpha_r * temp_i + alpha_i * temp_r; | |||||
| #else | |||||
|         y[ypos] += alpha_r * temp_r + alpha_i * temp_i; | |||||
|         y[ypos+1] -= alpha_r * temp_i - alpha_i * temp_r; | |||||
| #endif | |||||
|         column += col_step; | |||||
|         ypos += 2; | |||||
|     } | |||||
|     return(0); | |||||
| } | |||||
| @@ -0,0 +1,608 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/29 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 /* NOTE(review): presumably the local stack reservation, as in the sibling kernels - the prologue is outside this view */ | |||||
| #define OLD_LDA [fp, #0 ] /* fp-relative slots at non-negative offsets: presumably the incoming stack arguments - TODO confirm against the prologue */ | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #define OLD_A r3 | |||||
| #define OLD_N r1 /* shares r1 with AO1, which is why N also has a stack slot below */ | |||||
| #define M r0 | |||||
| #define AO1 r1 /* first matrix pointer; post-incremented by the kernel macros */ | |||||
| #define J r2 | |||||
| #define AO2 r4 /* second matrix pointer; post-incremented by the two-result kernel macro */ | |||||
| #define XO r5 /* running x pointer; post-incremented by the kernel macros */ | |||||
| #define YO r6 /* running y pointer; advanced by the save macros */ | |||||
| #define LDA r7 | |||||
| #define INC_X r8 | |||||
| #define INC_Y r9 | |||||
| #define I r12 | |||||
| #define N [fp, #-252 ] /* fp-relative local slot */ | |||||
| #define A [fp, #-256 ] /* fp-relative local slot */ | |||||
| #define X_PRE 512 /* NOTE(review): the three _PRE values look like prefetch offsets; none is referenced by the macros visible here */ | |||||
| #define A_PRE 512 | |||||
| #define Y_PRE 32 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* each name below becomes fmacd (multiply-add) or fnmacd (multiply-subtract) per conjugation variant */ | |||||
| #define KMAC_R fnmacd /* kernel: second product applied to the real accumulator */ | |||||
| #define KMAC_I fmacd /* kernel: second product applied to the imaginary accumulator */ | |||||
| #define FMAC_R1 fmacd /* save: real result updated with d0 * real sum */ | |||||
| #define FMAC_R2 fnmacd /* save: real result updated with d1 * imaginary sum */ | |||||
| #define FMAC_I1 fmacd /* save: imaginary result updated with d0 * imaginary sum */ | |||||
| #define FMAC_I2 fmacd /* save: imaginary result updated with d1 * real sum */ | |||||
| #elif defined(CONJ) && !defined(XCONJ) /* same roles as above, signs for the CONJ-only variant */ | |||||
| #define KMAC_R fmacd | |||||
| #define KMAC_I fnmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fnmacd | |||||
| #define FMAC_I1 fmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #elif !defined(CONJ) && defined(XCONJ) /* signs for the XCONJ-only variant */ | |||||
| #define KMAC_R fmacd | |||||
| #define KMAC_I fnmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fmacd | |||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #else /* both CONJ and XCONJ defined */ | |||||
| #define KMAC_R fnmacd | |||||
| #define KMAC_I fmacd | |||||
| #define FMAC_R1 fmacd | |||||
| #define FMAC_R2 fmacd | |||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I2 fmacd | |||||
| #endif | |||||
| .macro INIT_F2 /* | |||||
| * Clear the two accumulator pairs d12/d13 and d14/d15 by subtracting each | |||||
| * register from itself. | |||||
| * NOTE(review): x - x is NaN when the register still holds NaN or an | |||||
| * infinity, so this does not yield zero for every prior register | |||||
| * content - consider loading an explicit zero instead. | |||||
| */ | |||||
| vsub.f64 d12, d12, d12 | |||||
| vsub.f64 d13, d13, d13 | |||||
| vsub.f64 d14, d14, d14 | |||||
| vsub.f64 d15, d15, d15 | |||||
| .endm | |||||
| .macro KERNEL_F2X4 /* four consecutive KERNEL_F2X1 steps */ | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| KERNEL_F2X1 | |||||
| .endm | |||||
| .macro KERNEL_F2X1 /* | |||||
| * One inner-loop step for two results at once: reads one (real, | |||||
| * imaginary) pair from XO, one from AO1 and one from AO2 - all three | |||||
| * pointers are post-incremented by 16 bytes - and accumulates the | |||||
| * AO1 product into d12/d13 and the AO2 product into d14/d15. | |||||
| * Clobbers d2-d5, d8 and d9. | |||||
| */ | |||||
| fldmiad XO! , { d2 - d3 } | |||||
| fldmiad AO1!, { d4 - d5 } | |||||
| fmacd d12 , d4 , d2 | |||||
| fmacd d13 , d4 , d3 | |||||
| fldmiad AO2!, { d8 - d9 } | |||||
| KMAC_R d12 , d5 , d3 | |||||
| KMAC_I d13 , d5 , d2 | |||||
| fmacd d14 , d8 , d2 | |||||
| fmacd d15 , d8 , d3 | |||||
| KMAC_R d14 , d9 , d3 | |||||
| KMAC_I d15 , d9 , d2 | |||||
| .endm | |||||
/*
 * Folds the two column accumulators into two adjacent Y elements.
 * Reads four doubles at YO (d4/d5 = first element, d6/d7 = second), adds the
 * accumulators d12/d13 and d14/d15 scaled by (d0, d1) with the signs chosen
 * by FMAC_*, writes the four doubles back and advances YO past them.
 * NOTE(review): d0/d1 are never written in this file's visible code;
 * presumably they carry the real and imaginary part of alpha from the
 * caller - confirm against the calling convention in use.
 */
| .macro SAVE_F2 | |||||
| fldmiad YO, { d4 - d7 } | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| FMAC_R1 d6 , d0 , d14 | |||||
| FMAC_I1 d7 , d0 , d15 | |||||
| FMAC_R2 d6 , d1 , d15 | |||||
| FMAC_I2 d7 , d1 , d14 | |||||
| fstmiad YO!, { d4 - d7 } | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
/*
 * INIT_F1: set the accumulator pair d12/d13 (used by KERNEL_F1X1 and
 * SAVE_F1) to +0.0.
 *
 * The zero is built from an all-zero core register instead of subtracting
 * each register from itself: "x - x" is only zero for finite x, so a NaN or
 * infinity left in d12/d13 by earlier code would otherwise turn the
 * accumulated result into NaN.
 *
 * Clobbers I (r12); the call site reloads I immediately after this macro.
 * Condition flags are left untouched.
 */
.macro INIT_F1
	mov		I, #0
	vmov		d12, I, I
	vmov.f64	d13, d12
.endm
/* Four back-to-back KERNEL_F1X1 steps (inner loop unrolled by four). */
| .macro KERNEL_F1X4 | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| KERNEL_F1X1 | |||||
| .endm | |||||
/*
 * One inner-loop step for a single column, contiguous X.
 * Loads one (re, im) pair from X into d2/d3 and one from column AO1 into
 * d4/d5 (both pointers post-incremented) and accumulates their complex
 * product into d12/d13. KMAC_R / KMAC_I pick the sign of the terms that
 * involve the imaginary part of A.
 */
| .macro KERNEL_F1X1 | |||||
| fldmiad XO! , { d2 - d3 } | |||||
| fldmiad AO1!, { d4 - d5 } | |||||
| fmacd d12 , d4 , d2 | |||||
| fmacd d13 , d4 , d3 | |||||
| KMAC_R d12 , d5 , d3 | |||||
| KMAC_I d13 , d5 , d2 | |||||
| .endm | |||||
/*
 * Folds the accumulator d12/d13 into one Y element.
 * Reads the (re, im) pair at YO into d4/d5, adds the accumulator scaled by
 * (d0, d1) with the signs chosen by FMAC_*, writes the pair back and advances
 * YO past it.
 */
| .macro SAVE_F1 | |||||
| fldmiad YO, { d4 - d5 } | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| fstmiad YO!, { d4 - d5 } | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
/*
 * INIT_S2: set the four accumulators d12-d15 (used by KERNEL_S2X1 and
 * SAVE_S2) to +0.0.
 *
 * The zero is built from an all-zero core register instead of subtracting
 * each register from itself: "x - x" is only zero for finite x, so a NaN or
 * infinity left in d12-d15 by earlier code would otherwise turn every
 * accumulated result into NaN.
 *
 * Clobbers I (r12); the call site reloads I immediately after this macro.
 * Condition flags are left untouched.
 */
.macro INIT_S2
	mov		I, #0
	vmov		d12, I, I
	vmov.f64	d13, d12
	vmov.f64	d14, d12
	vmov.f64	d15, d12
.endm
/* Four back-to-back KERNEL_S2X1 steps (inner loop unrolled by four). */
| .macro KERNEL_S2X4 | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| KERNEL_S2X1 | |||||
| .endm | |||||
/*
 * One inner-loop step for two columns, strided X.
 * Same arithmetic as the contiguous two-column kernel, but X is read without
 * write-back and XO is then advanced by INC_X, which the strided entry path
 * has already converted to a byte count. AO1 and AO2 are still
 * post-incremented by their loads.
 */
| .macro KERNEL_S2X1 | |||||
| fldmiad XO , { d2 - d3 } | |||||
| fldmiad AO1!, { d4 - d5 } | |||||
| fldmiad AO2!, { d8 - d9 } | |||||
| fmacd d12 , d4 , d2 | |||||
| fmacd d13 , d4 , d3 | |||||
| KMAC_R d12 , d5 , d3 | |||||
| KMAC_I d13 , d5 , d2 | |||||
| fmacd d14 , d8 , d2 | |||||
| fmacd d15 , d8 , d3 | |||||
| KMAC_R d14 , d9 , d3 | |||||
| KMAC_I d15 , d9 , d2 | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
/*
 * Folds the two column accumulators into two Y elements that are INC_Y
 * bytes apart. Each element is read, updated with its accumulator scaled by
 * (d0, d1) using the FMAC_* signs, and written back in place; YO is then
 * stepped by INC_Y, so it ends up two elements further on.
 */
| .macro SAVE_S2 | |||||
/* First element: accumulator d12/d13. */
| fldmiad YO, { d4 - d5 } | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, INC_Y | |||||
/* Second element: accumulator d14/d15. */
| fldmiad YO, { d6 - d7 } | |||||
| FMAC_R1 d6 , d0 , d14 | |||||
| FMAC_I1 d7 , d0 , d15 | |||||
| FMAC_R2 d6 , d1 , d15 | |||||
| FMAC_I2 d7 , d1 , d14 | |||||
| fstmiad YO, { d6 - d7 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************************/ | |||||
/*
 * INIT_S1: set the accumulator pair d12/d13 (used by KERNEL_S1X1 and
 * SAVE_S1) to +0.0.
 *
 * The zero is built from an all-zero core register instead of subtracting
 * each register from itself: "x - x" is only zero for finite x, so a NaN or
 * infinity left in d12/d13 by earlier code would otherwise turn the
 * accumulated result into NaN.
 *
 * Clobbers I (r12); the call site reloads I immediately after this macro.
 * Condition flags are left untouched.
 */
.macro INIT_S1
	mov		I, #0
	vmov		d12, I, I
	vmov.f64	d13, d12
.endm
/* Four back-to-back KERNEL_S1X1 steps (inner loop unrolled by four). */
| .macro KERNEL_S1X4 | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| KERNEL_S1X1 | |||||
| .endm | |||||
/*
 * One inner-loop step for a single column, strided X.
 * Loads the (re, im) pair at XO into d2/d3 (no write-back) and one pair from
 * column AO1 into d4/d5 (post-incremented), accumulates their complex
 * product into d12/d13, then advances XO by INC_X bytes.
 */
| .macro KERNEL_S1X1 | |||||
| fldmiad XO , { d2 - d3 } | |||||
| fldmiad AO1!, { d4 - d5 } | |||||
| fmacd d12 , d4 , d2 | |||||
| fmacd d13 , d4 , d3 | |||||
| KMAC_R d12 , d5 , d3 | |||||
| KMAC_I d13 , d5 , d2 | |||||
| add XO, XO, INC_X | |||||
| .endm | |||||
/*
 * Folds the accumulator d12/d13 into the Y element at YO: reads the pair,
 * adds the accumulator scaled by (d0, d1) with the FMAC_* signs, writes it
 * back in place and advances YO by INC_Y bytes.
 */
| .macro SAVE_S1 | |||||
| fldmiad YO, { d4 - d5 } | |||||
| FMAC_R1 d4 , d0 , d12 | |||||
| FMAC_I1 d5 , d0 , d13 | |||||
| FMAC_R2 d4 , d1 , d13 | |||||
| FMAC_I2 d5 , d1 , d12 | |||||
| fstmiad YO, { d4 - d5 } | |||||
| add YO, YO, INC_Y | |||||
| .endm | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
/*
 * Kernel entry point.
 *
 * For every column of A (columns are LDA bytes apart) the code accumulates,
 * over M consecutive elements, the complex product of the column element and
 * the matching X element, then adds that sum scaled by (d0, d1) to one
 * element of Y. Columns are handled two at a time, with a single-column pass
 * for an odd column count. A contiguous path (F*) is used when both strides
 * equal 1, otherwise the strided path (S*) runs. Always returns 0 in r0.
 *
 * NOTE(review): d0/d1 are read by the SAVE_* macros but never written here;
 * presumably alpha arrives in them from the caller - confirm.
 */
| PROLOGUE | |||||
| .align 5 | |||||
/* Save the callee-saved core registers (7 words = 28 bytes) and point fp at
   the value sp had on entry, so stack arguments sit at positive fp offsets. */
| push {r4 - r9 , fp} | |||||
| add fp, sp, #28 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | |||||
/* Save area for the floating-point registers starts 192 bytes below fp. */
| sub r12, fp, #192 | |||||
/* NOTE(review): the macros in this file clobber d8, d9 and d12-d15; the
   non-DOUBLE branch saves only s8-s15. That looks safe only if this file is
   always built with DOUBLE defined - confirm. */
| #if defined(DOUBLE) | |||||
| vstm r12, { d8 - d15 } // store floating point registers | |||||
| #else | |||||
| vstm r12, { s8 - s15 } // store floating point registers | |||||
| #endif | |||||
/* Nothing to do for an empty problem. */
| cmp M, #0 | |||||
| ble zgemvt_kernel_L999 | |||||
| cmp OLD_N, #0 | |||||
| ble zgemvt_kernel_L999 | |||||
/* Spill A and N to the frame: r3 and r1 are reused from here on. */
| str OLD_A, A | |||||
| str OLD_N, N | |||||
| ldr INC_X , OLD_INC_X | |||||
| ldr INC_Y , OLD_INC_Y | |||||
/* A zero stride on either vector is treated as "nothing to do". */
| cmp INC_X, #0 | |||||
| beq zgemvt_kernel_L999 | |||||
| cmp INC_Y, #0 | |||||
| beq zgemvt_kernel_L999 | |||||
| ldr LDA, OLD_LDA | |||||
| #if defined(DOUBLE) | |||||
| lsl LDA, LDA, #4 // LDA * SIZE | |||||
| #else | |||||
| lsl LDA, LDA, #3 // LDA * SIZE | |||||
| #endif | |||||
/* The contiguous path needs both strides to be exactly 1. */
| cmp INC_X, #1 | |||||
| bne zgemvt_kernel_S2_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne zgemvt_kernel_S2_BEGIN | |||||
/* ---- Contiguous path, two columns per outer iteration ---- */
| zgemvt_kernel_F2_BEGIN: | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble zgemvt_kernel_F1_BEGIN | |||||
| zgemvt_kernel_F2X4: | |||||
/* AO1/AO2 = current column pair; store the address of the next pair in A. */
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
/* X is walked from its start for every column pair. */
| ldr XO , X | |||||
| INIT_F2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble zgemvt_kernel_F2X1 | |||||
| zgemvt_kernel_F2X4_10: | |||||
| KERNEL_F2X4 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_F2X4_10 | |||||
| zgemvt_kernel_F2X1: | |||||
/* Remaining M mod 4 elements, one at a time. */
| ands I, M , #3 | |||||
| ble zgemvt_kernel_F2_END | |||||
| zgemvt_kernel_F2X1_10: | |||||
| KERNEL_F2X1 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_F2X1_10 | |||||
| zgemvt_kernel_F2_END: | |||||
| SAVE_F2 | |||||
| subs J , J , #1 | |||||
| bne zgemvt_kernel_F2X4 | |||||
/* ---- Contiguous path, last column when N is odd ---- */
| zgemvt_kernel_F1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble zgemvt_kernel_L999 | |||||
| zgemvt_kernel_F1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_F1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble zgemvt_kernel_F1X1 | |||||
| zgemvt_kernel_F1X4_10: | |||||
| KERNEL_F1X4 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_F1X4_10 | |||||
| zgemvt_kernel_F1X1: | |||||
| ands I, M , #3 | |||||
| ble zgemvt_kernel_F1_END | |||||
| zgemvt_kernel_F1X1_10: | |||||
| KERNEL_F1X1 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_F1X1_10 | |||||
| zgemvt_kernel_F1_END: | |||||
| SAVE_F1 | |||||
| b zgemvt_kernel_L999 | |||||
| /*************************************************************************************************************/ | |||||
/* ---- Strided path: same loop structure, strides converted to bytes first ---- */
| zgemvt_kernel_S2_BEGIN: | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #4 // INC_Y * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| lsl INC_Y, INC_Y, #3 // INC_Y * SIZE | |||||
| #endif | |||||
| ldr YO , Y | |||||
| ldr J, N | |||||
| asrs J, J, #1 // J = N / 2 | |||||
| ble zgemvt_kernel_S1_BEGIN | |||||
| zgemvt_kernel_S2X4: | |||||
/* AO1/AO2 = current column pair; store the address of the next pair in A. */
| ldr AO1, A | |||||
| add AO2, AO1, LDA | |||||
| add r3 , AO2, LDA | |||||
| str r3 , A | |||||
| ldr XO , X | |||||
| INIT_S2 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble zgemvt_kernel_S2X1 | |||||
| zgemvt_kernel_S2X4_10: | |||||
| KERNEL_S2X4 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_S2X4_10 | |||||
| zgemvt_kernel_S2X1: | |||||
/* Remaining M mod 4 elements, one at a time. */
| ands I, M , #3 | |||||
| ble zgemvt_kernel_S2_END | |||||
| zgemvt_kernel_S2X1_10: | |||||
| KERNEL_S2X1 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_S2X1_10 | |||||
| zgemvt_kernel_S2_END: | |||||
| SAVE_S2 | |||||
| subs J , J , #1 | |||||
| bne zgemvt_kernel_S2X4 | |||||
/* ---- Strided path, last column when N is odd ---- */
| zgemvt_kernel_S1_BEGIN: | |||||
| ldr J, N | |||||
| ands J, J, #1 | |||||
| ble zgemvt_kernel_L999 | |||||
| zgemvt_kernel_S1X4: | |||||
| ldr AO1, A | |||||
| ldr XO , X | |||||
| INIT_S1 | |||||
| asrs I, M, #2 // I = M / 4 | |||||
| ble zgemvt_kernel_S1X1 | |||||
| zgemvt_kernel_S1X4_10: | |||||
| KERNEL_S1X4 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_S1X4_10 | |||||
| zgemvt_kernel_S1X1: | |||||
| ands I, M , #3 | |||||
| ble zgemvt_kernel_S1_END | |||||
| zgemvt_kernel_S1X1_10: | |||||
| KERNEL_S1X1 | |||||
| subs I, I, #1 | |||||
| bne zgemvt_kernel_S1X1_10 | |||||
| zgemvt_kernel_S1_END: | |||||
| SAVE_S1 | |||||
| /*************************************************************************************************************/ | |||||
/* Common exit: restore what the prologue saved, unwind the frame, return 0. */
| zgemvt_kernel_L999: | |||||
| sub r3, fp, #192 | |||||
| #if defined(DOUBLE) | |||||
| vldm r3, { d8 - d15 } // restore floating point registers | |||||
| #else | |||||
| vldm r3, { s8 - s15 } // restore floating point registers | |||||
| #endif | |||||
| mov r0, #0 // set return value | |||||
/* fp - 28 is where sp stood right after the push in the prologue. */
| sub sp, fp, #28 | |||||
| pop {r4 -r9 ,fp} | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,106 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/13 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT scale = 0.0; | |||||
| FLOAT ssq = 1.0; | |||||
| BLASLONG inc_x2; | |||||
| FLOAT temp; | |||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| n *= inc_x2; | |||||
| while(i < n) | |||||
| { | |||||
| if ( x[i] != 0.0 ) | |||||
| { | |||||
| temp = ABS( x[i] ); | |||||
| if ( scale < temp ) | |||||
| { | |||||
| ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); | |||||
| scale = temp ; | |||||
| } | |||||
| else | |||||
| { | |||||
| ssq += ( temp / scale ) * ( temp / scale ); | |||||
| } | |||||
| } | |||||
| if ( x[i+1] != 0.0 ) | |||||
| { | |||||
| temp = ABS( x[i+1] ); | |||||
| if ( scale < temp ) | |||||
| { | |||||
| ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); | |||||
| scale = temp ; | |||||
| } | |||||
| else | |||||
| { | |||||
| ssq += ( temp / scale ) * ( temp / scale ); | |||||
| } | |||||
| } | |||||
| i += inc_x2; | |||||
| } | |||||
| scale = scale * sqrt( ssq ); | |||||
| return(scale); | |||||
| } | |||||
| @@ -0,0 +1,68 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT temp[2]; | |||||
| if ( n <= 0 ) return(0); | |||||
| BLASLONG inc_x2 = 2 * inc_x ; | |||||
| BLASLONG inc_y2 = 2 * inc_y ; | |||||
| while(i < n) | |||||
| { | |||||
| temp[0] = c*x[ix] + s*y[iy] ; | |||||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||||
| y[iy] = c*y[iy] - s*x[ix] ; | |||||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||||
| x[ix] = temp[0] ; | |||||
| x[ix+1] = temp[1] ; | |||||
| ix += inc_x2 ; | |||||
| iy += inc_y2 ; | |||||
| i++ ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,64 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/09/14 Saar | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT temp; | |||||
| if ( n < 0 || inc_x < 1 ) return(0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| for ( i=0; i<n; i++ ) | |||||
| { | |||||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||||
| x[ip] = temp; | |||||
| ip += inc_x2; | |||||
| } | |||||
| return(0); | |||||
| } | |||||