Merge from develop for 0.3.12 release (tags/v0.3.12)
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 11) | |||
| set(OpenBLAS_PATCH_VERSION 12) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -1,9 +1,36 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.12 | |||
| 24-Oct-2020 | |||
| common: | |||
| * Fixed missing LAPACK functions (inadvertently dropped during | |||
| the build system restructuring) | |||
| * Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458) | |||
| POWER: | |||
| * Added optimized SCOPY/CCOPY kernels for POWER10 | |||
| * Increased and unified the default size of the GEMM BUFFER | |||
| * Fixed building for POWER10 in DYNAMIC_ARCH mode | |||
| * POWER10 compatibility test now checks binutils version as well | |||
| * Cleaned up compiler warnings | |||
| x86_64: | |||
| * corrected compiler version checks for AVX2 compatibility | |||
| * added compiler option -mavx2 for building with flang | |||
| * fixed direct SGEMM pathway for small matrix sizes (broken by | |||
| the code refactoring in 0.3.11) | |||
| * fixed unhandled partial register clobbers in several kernels | |||
| for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer | |||
| ARMV8: | |||
| * improved Apple Vortex support to include cross-compiling | |||
| ==================================================================== | |||
| Version 0.3.11 | |||
| 17-Oct-2020 | |||
| common: | |||
| common: | |||
| * API change: | |||
| the newly added BFLOAT16 functions were renamed to use the | |||
| letter "B" instead of "H" to avoid potential confusion with | |||
| @@ -28,7 +55,7 @@ Version 0.3.11 | |||
| * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as | |||
| enabling these options | |||
| * Fixed detection of gfortran when invoked through an mpi wrapper | |||
| * Improve thread reinitialization performance with OpenMP xafter a fork | |||
| * Improve thread reinitialization performance with OpenMP after a fork | |||
| * Added support for building only the subset of the library required | |||
| for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE | |||
| * Optional function name prefixes and suffixes are now correctly | |||
| @@ -66,7 +93,6 @@ ARMV8: | |||
| * Fixed cpu detection on BSD-like systems | |||
| * Fixed compilation in -std=C18 mode | |||
| IBM Z: | |||
| * Added support for compiling with the clang compiler | |||
| * Improved GEMM performance on Z14 | |||
| @@ -10,7 +10,7 @@ USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.11 | |||
| VERSION = 0.3.12 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -295,10 +295,13 @@ COMMON_PROF = -pg | |||
| # the below is not yet configurable, use cmake if you need to build only select types | |||
| BUILD_SINGLE = 1 | |||
| BUILD_DOUBLE = 1 | |||
| BUILD_COMPLEX = 1 | |||
| BUILD_COMPLEX16 = 1 | |||
| # By default the library contains BLAS functions (and LAPACK if selected) for all input types. | |||
| # To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only | |||
| # the functions for complex numbers, uncomment the desired type(s) below | |||
| # BUILD_SINGLE = 1 | |||
| # BUILD_DOUBLE = 1 | |||
| # BUILD_COMPLEX = 1 | |||
| # BUILD_COMPLEX16 = 1 | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -641,6 +641,7 @@ DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| DYNAMIC_CORE += POWER9 | |||
| DYNAMIC_CORE += POWER10 | |||
| CCOMMON_OPT += -DHAVE_P10_SUPPORT | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| @@ -648,11 +649,14 @@ DYNAMIC_CORE += POWER9 | |||
| else | |||
| $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| # Takes the second dot-separated field of the first line of `ld --version`, cuts it at the first "-", and asks expr whether it is >= 35. | |||
| # The >= must be backslash-escaped: unescaped, the shell treats ">" as an output redirection (to a file named "="), so nothing is captured and this variable stays empty. | |||
| LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) | |||
| ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) | |||
| DYNAMIC_CORE += POWER10 | |||
| CCOMMON_OPT += -DHAVE_P10_SUPPORT | |||
| else ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11) | |||
| DYNAMIC_CORE += POWER10 | |||
| CCOMMON_OPT += -DHAVE_P10_SUPPORT | |||
| endif | |||
| else | |||
| $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| @@ -74,8 +74,10 @@ ifndef NO_AVX2 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) | |||
| ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) | |||
| CCOMMON_OPT += -mavx2 | |||
| endif | |||
| else | |||
| @@ -86,8 +88,14 @@ endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) | |||
| ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| else | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| @@ -49,6 +49,7 @@ if (DYNAMIC_ARCH) | |||
| if (POWER) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") | |||
| endif () | |||
| if (X86) | |||
| @@ -416,6 +416,29 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ARMV8\n" | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||
| "#define L2_SIZE\t5262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "POWER6") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| @@ -844,8 +844,8 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define BUFFER_SIZE ( 64 << 22) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #endif | |||
| @@ -424,7 +424,7 @@ void get_cpuconfig(void) | |||
| sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
| printf("#define L1_DATA_SIZE %d \n",value); | |||
| sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
| printf("#define L2_DATA_SIZE %d \n",value); | |||
| printf("#define L2_SIZE %d \n",value); | |||
| break; | |||
| #endif | |||
| } | |||
| @@ -6,10 +6,10 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||
| || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| #define HAVE_P10_SUPPORT 1 | |||
| #endif | |||
| //#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||
| // || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| //#define HAVE_P10_SUPPORT 1 | |||
| //#endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| @@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME) | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | |||
| $(LIBPREFIX).def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| libgoto_hpl.def : gensymbol | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ifeq ($(OSNAME), Darwin) | |||
| INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| @@ -258,16 +258,16 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| @@ -50,8 +50,8 @@ | |||
| zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | |||
| zgeadd, dzsum); | |||
| @cblasobjs = (lsame, xerbla); | |||
| @halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @blasobjs = (lsame, xerbla); | |||
| @bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @cblasobjsc = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| @@ -72,7 +72,7 @@ | |||
| ); | |||
| @cblasobjss = ( | |||
| cblas_sasum, cblas_saxpy, | |||
| cblas_sasum, cblas_saxpy, cblas_saxpby, | |||
| cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, | |||
| cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, | |||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
| @@ -94,7 +94,7 @@ | |||
| @cblasobjs = ( cblas_xerbla ); | |||
| @halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| @@ -415,7 +415,7 @@ zpotri, | |||
| cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, | |||
| ctpqrt, ctpqrt2, ctpmqrt, ctprfb, | |||
| ); | |||
| @lapack2objszc = ( | |||
| @lapackobjs2zc = ( | |||
| # ZCLASRC -- Double-single mixed precision complex routines called from | |||
| # single, single-extra and double precision complex LAPACK | |||
| # routines (i.e. from CLASRC, CXLASRC, ZLASRC). | |||
| @@ -425,7 +425,7 @@ zpotri, | |||
| cpotrs, | |||
| ); | |||
| @lapack2objsd = ( | |||
| @lapackobjs2d = ( | |||
| # DLASRC -- Double precision real LAPACK routines | |||
| # already provided by @lapackobjs: | |||
| # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, | |||
| @@ -568,7 +568,7 @@ zpotri, | |||
| ); | |||
| # functions added for lapack-3.6.0 | |||
| @lapack2objsc = ( @lapack2objsc, | |||
| @lapackobjs2c = ( @lapackobjs2c, | |||
| cgejsv, | |||
| cgesvdx, | |||
| cgesvj, | |||
| @@ -604,7 +604,7 @@ zpotri, | |||
| csyr2, | |||
| cunm22, | |||
| ); | |||
| @lapackobjs2d = (@lapack2objsd, | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dbdsvdx, | |||
| dgesvdx, | |||
| dgetrf2, | |||
| @@ -637,7 +637,7 @@ zpotri, | |||
| dpotrf2, | |||
| dsecnd, | |||
| ); | |||
| @lapack2objss = (@lapack2objss, | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sbdsvdx, | |||
| second, | |||
| sgesvdx, | |||
| @@ -670,7 +670,7 @@ zpotri, | |||
| sorm22, | |||
| spotrf2, | |||
| ); | |||
| @lapack2objsz = (@lapack2objsz, | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgejsv, | |||
| zgesvdx, | |||
| zgesvj, | |||
| @@ -707,7 +707,7 @@ zpotri, | |||
| zunm22, | |||
| ); | |||
| # functions added for lapack-3.7.0 | |||
| @lapack2objss = (@lapack2objss, | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| slarfy, | |||
| strevc3, | |||
| sgelqt, | |||
| @@ -726,7 +726,7 @@ zpotri, | |||
| stplqt2, | |||
| stpmlqt, | |||
| ); | |||
| @lapack2objsd = (@lapack2objsd, | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dlarfy, | |||
| dsyconvf, | |||
| dtrevc3, | |||
| @@ -746,7 +746,7 @@ zpotri, | |||
| dtplqt2, | |||
| dtpmlqt, | |||
| ); | |||
| @lapack2objsc = (@lapack2objsc, | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| clarfy, | |||
| csyconvf, | |||
| ctrevc3, | |||
| @@ -766,7 +766,7 @@ zpotri, | |||
| ctplqt2, | |||
| ctpmlqt, | |||
| ); | |||
| @lapack2objsz = (@lapack2objsz, | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zlarfy, | |||
| zsyconvf, | |||
| ztrevc3, | |||
| @@ -786,31 +786,31 @@ zpotri, | |||
| zlamswlq, | |||
| zgemlq, | |||
| ); | |||
| @lapack2objs = (@lapack2objs, | |||
| sladiv1, | |||
| dladiv1, | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sladiv1); | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dladiv1); | |||
| @lapackobjs = (@lapackobjs, | |||
| iparam2stage, | |||
| # functions added for lapack-3.8.0 | |||
| ilaenv2stage, | |||
| ); | |||
| # functions added for lapack-3.9.0 | |||
| @lapack2objsc = (@lapack2objsc, | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| cgesvdq, | |||
| cungtsqr, | |||
| dcombssq, | |||
| cungtsqr | |||
| ); | |||
| @lapack2objsd = (@lapack2objsd, | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dcombssq, | |||
| dgesvdq, | |||
| dorgtsqr, | |||
| ); | |||
| @lapack2objss = (@lapack2objss, | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| scombssq, | |||
| sgesvdq, | |||
| sorgtsqr, | |||
| ); | |||
| @lapack2objsz = (@lapack2objsz, | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgesvdq, | |||
| zungtsqr | |||
| ); | |||
| @@ -835,10 +835,29 @@ zpotri, | |||
| dlatzm, dtzrqf); | |||
| @lapack_deprecated_objss = ( | |||
| sgelsx, | |||
| sgegs, | |||
| sgegv, | |||
| sgegv, | |||
| sgeqpf, | |||
| sggsvd, | |||
| sggsvp, | |||
| slahrd, | |||
| slatzm, | |||
| stzrqf | |||
| ); | |||
| @lapack_deprecated_objsz = ( | |||
| zgegs, | |||
| zgegv, | |||
| zgelsx, | |||
| zgeqpf, | |||
| zggsvd, | |||
| zggsvp, | |||
| zlahrd, | |||
| zlatzm, | |||
| ztzrqf | |||
| ); | |||
| @lapacke_deprecated_objsc = ( | |||
| LAPACKE_cggsvp, | |||
| LAPACKE_cggsvp_work, | |||
| @@ -3590,14 +3609,18 @@ use File::Basename; | |||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | |||
| if ($ARGV[12] == 1) { | |||
| @blasobjs = (@blasobjs, @halfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @halfcblasobjs); | |||
| @blasobjs = (@blasobjs, @bfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @bfcblasobjs); | |||
| } | |||
| if ($ARGV[13] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjss); | |||
| @cblasobjs = (@cblasobjs, @cblasobjss); | |||
| @lapackobjs = (@lapackobjs, @lapackobjss); | |||
| @lapack2objs = (@lapack2objs, @lapack2objss); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2s); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss); | |||
| @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss); | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjss); | |||
| } | |||
| @@ -3605,7 +3628,12 @@ if ($ARGV[14] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsd); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsd); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsd); | |||
| @lapack2objs = (@lapack2objs, @lapack2objsd); | |||
| if ($ARGV[13] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); | |||
| } | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd); | |||
| @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsd); | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsd); | |||
| } | |||
| @@ -3613,9 +3641,14 @@ if ($ARGV[15] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsc); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsc); | |||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsc); | |||
| @lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); | |||
| if ($ARGV[13] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); | |||
| } | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); | |||
| @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc); | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsc); | |||
| } | |||
| @@ -3623,9 +3656,17 @@ if ($ARGV[16] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsz); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsz); | |||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsz); | |||
| @lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); | |||
| if ($ARGV[15] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); | |||
| } | |||
| if ($ARGV[14] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); | |||
| } | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); | |||
| @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz); | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsz); | |||
| } | |||
| @@ -1222,6 +1222,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_VORTEX | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "VORTEX" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DVORTEX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "vortex" | |||
| #define CORENAME "VORTEX" | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -22,20 +22,25 @@ ifeq ($(C_COMPILER), CLANG) | |||
| override CFLAGS += -fno-integrated-as | |||
| endif | |||
| endif | |||
| AVX2OPT = | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) | |||
| ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) | |||
| AVX2OPT = -mavx2 | |||
| endif | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| # Any clang posing as gcc 4.2 should be new enough (3.4 or later) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) | |||
| # Three-digit flag string: (major >= 5)(major >= 4)(minor >= 2). The clang branch computes GCCMINORVERSIONGTEQ2 just above, so that is the minor-version flag to combine here. | |||
| GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) | |||
| ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) | |||
| AVX2OPT = -mavx2 | |||
| endif | |||
| endif | |||
| @@ -150,9 +150,9 @@ CAXPYKERNEL = caxpy.c | |||
| endif | |||
| ZAXPYKERNEL = zaxpy_power10.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| SCOPYKERNEL = scopy_power10.c | |||
| DCOPYKERNEL = dcopy_power10.c | |||
| CCOPYKERNEL = ccopy.c | |||
| CCOPYKERNEL = ccopy_power10.c | |||
| ZCOPYKERNEL = zcopy_power10.c | |||
| # | |||
| SDOTKERNEL = sdot.c | |||
| @@ -0,0 +1,132 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "copy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL | |||
| /* | |||
| * Portable fallback for copy_kernel, compiled only when HAVE_KERNEL is not defined. | |||
| * Copies FLOATs from x to y in chunks of 8 while advancing the element counter by 4 | |||
| * per chunk (CNAME below treats each element as two consecutive FLOATs). | |||
| * The loop runs while the counter is below n, so a full chunk of 8 FLOATs is always | |||
| * copied per iteration; CNAME only passes n values that are multiples of 64. | |||
| */ | |||
| static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| FLOAT chunk[8]; | |||
| FLOAT *src = x; | |||
| FLOAT *dst = y; | |||
| BLASLONG done; | |||
| BLASLONG k; | |||
| for (done = 0; done < n; done += 4) | |||
| { | |||
| /* Load the whole chunk before storing any of it, as the original did. */ | |||
| for (k = 0; k < 8; k++) | |||
| chunk[k] = src[k]; | |||
| for (k = 0; k < 8; k++) | |||
| dst[k] = chunk[k]; | |||
| src += 8; | |||
| dst += 8; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| /* | |||
| * Copy n elements from x to y, where each element occupies two consecutive FLOATs. | |||
| * | |||
| * n      number of elements; nothing is done when n <= 0 | |||
| * x      source vector | |||
| * inc_x  stride of x in elements (doubled internally to index FLOATs) | |||
| * y      destination vector | |||
| * inc_y  stride of y in elements (doubled internally to index FLOATs) | |||
| * | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| /* Unit stride: hand the largest multiple of 64 elements to copy_kernel. */ | |||
| BLASLONG n1 = n & -64; | |||
| if ( n1 > 0 ) | |||
| { | |||
| copy_kernel(n1, x, y); | |||
| i=n1; | |||
| /* Two FLOATs per element, so the FLOAT indices resume at 2*n1. */ | |||
| ix=n1*2; | |||
| iy=n1*2; | |||
| } | |||
| /* Copy the remaining (fewer than 64) elements one at a time. */ | |||
| while(i < n) | |||
| { | |||
| /* Read through ix, the source index; it equals iy here, so results are unchanged. */ | |||
| y[iy] = x[ix] ; | |||
| y[iy+1] = x[ix+1] ; | |||
| ix+=2; | |||
| iy+=2; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| /* General strides: step each FLOAT index by twice the element stride. */ | |||
| BLASLONG inc_x2 = 2 * inc_x; | |||
| BLASLONG inc_y2 = 2 * inc_y; | |||
| while(i < n) | |||
| { | |||
| y[iy] = x[ix] ; | |||
| y[iy+1] = x[ix+1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -25,9 +25,9 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_64 1 | |||
| #define HAVE_KERNEL 1 | |||
| static void dcopy_kernel_64 (long n, double *x, double *y) | |||
| static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
| @@ -49,8 +49,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y) | |||
| "lxvp 60, 448(%2) \n\t" | |||
| "lxvp 62, 480(%2) \n\t" | |||
| "addi %2, %2, 512 \n\t" | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||
| "addic. %1, %1, -128 \n\t" | |||
| #elif defined(COMPLEX) && defined(DOUBLE) | |||
| "addic. %1, %1, -32 \n\t" | |||
| #else | |||
| "addic. %1, %1, -64 \n\t" | |||
| #endif | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| @@ -94,7 +99,13 @@ static void dcopy_kernel_64 (long n, double *x, double *y) | |||
| "addi %3, %3, 512 \n\t" | |||
| "addi %2, %2, 512 \n\t" | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) | |||
| "addic. %1, %1, -128 \n\t" | |||
| #elif defined(COMPLEX) && defined(DOUBLE) | |||
| "addic. %1, %1, -32 \n\t" | |||
| #else | |||
| "addic. %1, %1, -64 \n\t" | |||
| #endif | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| @@ -121,7 +132,7 @@ static void dcopy_kernel_64 (long n, double *x, double *y) | |||
| "=m" (*y), | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x) | |||
| : | |||
| @@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 400 | |||
| #define STACKSIZE 592 | |||
| #define ALPHA_R_SP 304+192(SP) | |||
| #define ALPHA_I_SP 312+192(SP) | |||
| #else | |||
| #define STACKSIZE 256 | |||
| #define STACKSIZE 452 | |||
| #define ALPHA_R_SP 224+196(SP) | |||
| #define ALPHA_I_SP 232+196(SP) | |||
| @@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "dcopy_microk_power10.c" | |||
| #include "copy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_64 | |||
| #ifndef HAVE_KERNEL | |||
| static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i=0; | |||
| @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| BLASLONG n1 = n & -64; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dcopy_kernel_64(n1, x, y); | |||
| copy_kernel(n1, x, y); | |||
| i=n1; | |||
| } | |||
| @@ -82,12 +82,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 320 | |||
| #define STACKSIZE 512 | |||
| #define ALPHA_SP 296+192(SP) | |||
| #define FZERO 304+192(SP) | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define STACKSIZE 440 | |||
| #define ALPHA_SP 224+200(SP) | |||
| #define FZERO 232+200(SP) | |||
| @@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 320 | |||
| #define STACKSIZE 520 | |||
| #define ALPHA_SP 296+200(SP) | |||
| #define FZERO 304+200(SP) | |||
| @@ -47,7 +47,6 @@ | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 320 | |||
| #define STACKSIZE 520 | |||
| #define ALPHA 296+200(SP) | |||
| #define FZERO 304+200(SP) | |||
| @@ -0,0 +1,123 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "copy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL | |||
| /* | |||
|  * Portable fallback for the unit-stride copy kernel. | |||
|  * | |||
|  * Copies elements from x to y in groups of eight: each group is first | |||
|  * read completely into a staging array and only then written out. | |||
|  * The loop runs while the running count is below n, so whole groups of | |||
|  * eight are always transferred; there is no handling of a partial group. | |||
|  */ | |||
| static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT staged[8]; | |||
|     FLOAT *src = x; | |||
|     FLOAT *dst = y; | |||
|     BLASLONG done; | |||
|     int k; | |||
|     for ( done = 0; done < n; done += 8 ) | |||
|     { | |||
|         /* read the whole group before writing any of it */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             staged[k] = src[k]; | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             dst[k] = staged[k]; | |||
|         src += 8; | |||
|         dst += 8; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Copy n elements of x into y, stepping through x by inc_x and through y | |||
|  * by inc_y. Always returns 0; a non-positive n copies nothing. | |||
|  * | |||
|  * When both increments are 1, the largest leading part of the vectors | |||
|  * whose length is a multiple of 128 is handed to copy_kernel() and the | |||
|  * remaining tail is copied element by element. Any other increment | |||
|  * combination is handled by a plain strided loop. | |||
|  */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG pos = 0; | |||
|     BLASLONG xi = 0, yi = 0; | |||
|     BLASLONG bulk; | |||
|     if ( n <= 0 ) | |||
|         return(0); | |||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||
|     { | |||
|         /* general strided copy */ | |||
|         for ( pos = 0; pos < n; pos++ ) | |||
|         { | |||
|             y[yi] = x[xi]; | |||
|             xi += inc_x; | |||
|             yi += inc_y; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* contiguous case: n rounded down to a multiple of 128 goes to the kernel */ | |||
|     bulk = n & -128; | |||
|     if ( bulk > 0 ) | |||
|     { | |||
|         copy_kernel (bulk, x, y); | |||
|         pos = bulk; | |||
|     } | |||
|     /* leftover elements that do not fill a kernel-sized chunk */ | |||
|     for ( ; pos < n; pos++ ) | |||
|         y[pos] = x[pos]; | |||
|     return(0); | |||
| } | |||
| @@ -82,7 +82,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 340 | |||
| #define STACKSIZE 540 | |||
| #define ALPHA_SP 296+200(SP) | |||
| #define FZERO 304+200(SP) | |||
| @@ -1,134 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void zcopy_kernel_32 (long n, double *x, double *y) /* Copy from x to y in 512-byte chunks using paired vector loads (lxvp) and stores (stxvp); n drops by 32 per chunk, i.e. 16 bytes per counted element. */ | |||
| { /* NOTE(review): one chunk is always loaded and stored and nothing is bounds-checked, so n is presumably a positive multiple of 32 - confirm against callers. */ | |||
| __asm__ | |||
| ( | |||
| "lxvp 32, 0(%2) \n\t" /* prologue: preload the first 512 bytes of x into vs32..vs63, 32 bytes per register pair */ | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "lxvp 48, 256(%2) \n\t" | |||
| "lxvp 50, 288(%2) \n\t" | |||
| "lxvp 52, 320(%2) \n\t" | |||
| "lxvp 54, 352(%2) \n\t" | |||
| "lxvp 56, 384(%2) \n\t" | |||
| "lxvp 58, 416(%2) \n\t" | |||
| "lxvp 60, 448(%2) \n\t" | |||
| "lxvp 62, 480(%2) \n\t" | |||
| "addi %2, %2, 512 \n\t" /* advance x past the chunk just loaded */ | |||
| "addic. %1, %1, -32 \n\t" /* n -= 32; the dot form records the comparison with zero in cr0 */ | |||
| "ble two%= \n\t" /* nothing further to load: skip the loop and just flush the preloaded chunk */ | |||
| ".align 5 \n" /* align the loop entry */ | |||
| "one%=: \n\t" /* main loop: store the chunk loaded earlier while loading the next one into the same registers */ | |||
| "stxvp 32, 0(%3) \n\t" | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "stxvp 48, 256(%3) \n\t" | |||
| "lxvp 48, 256(%2) \n\t" | |||
| "stxvp 50, 288(%3) \n\t" | |||
| "lxvp 50, 288(%2) \n\t" | |||
| "stxvp 52, 320(%3) \n\t" | |||
| "lxvp 52, 320(%2) \n\t" | |||
| "stxvp 54, 352(%3) \n\t" | |||
| "lxvp 54, 352(%2) \n\t" | |||
| "stxvp 56, 384(%3) \n\t" | |||
| "lxvp 56, 384(%2) \n\t" | |||
| "stxvp 58, 416(%3) \n\t" | |||
| "lxvp 58, 416(%2) \n\t" | |||
| "stxvp 60, 448(%3) \n\t" | |||
| "lxvp 60, 448(%2) \n\t" | |||
| "stxvp 62, 480(%3) \n\t" | |||
| "lxvp 62, 480(%2) \n\t" | |||
| "addi %3, %3, 512 \n\t" /* advance y past the chunk just stored */ | |||
| "addi %2, %2, 512 \n\t" /* advance x past the chunk just loaded */ | |||
| "addic. %1, %1, -32 \n\t" /* n -= 32, updating cr0 */ | |||
| "bgt one%= \n" /* keep looping while elements remain */ | |||
| "two%=: \n\t" /* epilogue: store the last chunk that was loaded */ | |||
| "stxvp 32, 0(%3) \n\t" | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| "stxvp 48, 256(%3) \n\t" | |||
| "stxvp 50, 288(%3) \n\t" | |||
| "stxvp 52, 320(%3) \n\t" | |||
| "stxvp 54, 352(%3) \n\t" | |||
| "stxvp 56, 384(%3) \n\t" | |||
| "stxvp 58, 416(%3) \n\t" | |||
| "stxvp 60, 448(%3) \n\t" | |||
| "stxvp 62, 480(%3) \n\t" | |||
| "#n=%1 x=%4=%2 y=%0=%3" /* assembler comment only: records the operand mapping in the generated assembly */ | |||
| : | |||
| "=m" (*y), /* operand 0: declares *y as a memory output */ | |||
| "+r" (n), // 1: remaining element count, decremented in place | |||
| "+b" (x), // 2: source pointer, advanced in place | |||
| "+b" (y) // 3: destination pointer, advanced in place | |||
| : | |||
| "m" (*x) /* operand 4: declares *x as a memory input */ | |||
| : | |||
| "cr0", /* written by the addic. instructions */ | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", /* vs32..vs63 serve as the 512-byte staging buffer */ | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||
| ); | |||
| } | |||
| @@ -28,12 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "zcopy_microk_power10.c" | |||
| #include "copy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| #ifndef HAVE_KERNEL | |||
| static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i=0; | |||
| @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| zcopy_kernel_32(n1, x, y); | |||
| copy_kernel(n1, x, y); | |||
| i=n1; | |||
| ix=n1*2; | |||
| iy=n1*2; | |||
| @@ -513,7 +513,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT al | |||
| #endif | |||
| static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest = *src; | |||
| @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -126,4 +126,5 @@ int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_di | |||
| } | |||
| src1 += src_inc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", | |||
| "%xmm8", | |||
| "%xmm12", "%xmm13", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| @@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| @@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| : "cc", | |||
| "%xmm0", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| @@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -1,7 +1,8 @@ | |||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||
| /* the direct sgemm code written by Arjan van de Ven */ | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||
| /* | |||
| * "Direct sgemm" code. This code operates directly on the inputs and outputs | |||
| * of the sgemm call, avoiding the copies, memory realignments and threading, | |||
| @@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| return; | |||
| @@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| @@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "r" (alpha), // 4 | |||
| "r" (mvec) // 5 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -71,7 +71,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, | |||
| goto exit_level_0; | |||
| } | |||
| liwork = iwork_query; | |||
| lcwork = LAPACK_C2INT(cwork_query); | |||
| lcwork = LAPACK_Z2INT(cwork_query); | |||
| lrwork = (lapack_int)rwork_query; | |||
| /* Allocate memory for work arrays */ | |||
| iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); | |||