Merge develop into 0.3.x for 0.3.3tags/v0.3.3
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
| project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 2) | |||||
| set(OpenBLAS_PATCH_VERSION 3.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| # Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
| @@ -150,6 +150,7 @@ endif() | |||||
| # add objects to the openblas lib | # add objects to the openblas lib | ||||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | ||||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||||
| # Android needs to explicitly link against libm | # Android needs to explicitly link against libm | ||||
| if(ANDROID) | if(ANDROID) | ||||
| @@ -169,6 +170,7 @@ endif() | |||||
| # Set output for libopenblas | # Set output for libopenblas | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | ||||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | ||||
| @@ -1,4 +1,115 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.2 | |||||
| 30-Jul-2018 | |||||
| common: | |||||
| * fixes for regressions caused by the rewrite of the thread | |||||
| initialization code in 0.3.1 | |||||
| POWER: | |||||
| * fixed cpu autodetection for the BSDs | |||||
| MIPS64: | |||||
| * fixed utest errors in AXPY, DSDOT, ROT and SWAP | |||||
| x86_64: | |||||
| * added autodetection of AMD Ryzen 2 | |||||
| * fixed build with older versions of MSVC | |||||
| ==================================================================== | |||||
| Version 0.3.1 | |||||
| 01-Jul-2018 | |||||
| common: | |||||
| * rewritten thread initialization code with significantly reduced overhead | |||||
| * added CBLAS interfaces to the IxAMIN BLAS extension functions | |||||
| * fixed the lapack-test target | |||||
| * CMAKE builds now create an OpenBLASConfig.cmake file | |||||
| * ZAXPY now uses a single thread for small input sizes | |||||
| * the LAPACK code was updated from Reference-LAPACK/lapack#253 | |||||
| (fixing LAPACKE interfaces to Aasen's functions) | |||||
| POWER: | |||||
| * corrected CROT and ZROT behaviour with zero INC_X | |||||
| ARMV7: | |||||
| * corrected xDOT behaviour with zero INC_X or INC_Y | |||||
| x86_64: | |||||
| * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, | |||||
| this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO | |||||
| (which will still be supported via the slower PRESCOTT kernels when this option is not set) | |||||
| * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to | |||||
| specify the list of x86_64 targets to include. Any target not on the list will be supported | |||||
| by the Sandybridge or Nehalem kernels if available, or by Prescott. | |||||
| * improved SWITCH_RATIO on Haswell for increased GEMM throughput | |||||
| * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel | |||||
| * added autodetection of Intel Cannon Lake series as Skylake X | |||||
| * added a default L2 cache size for hypervisors that return zero here (Chromebook) | |||||
| * fixed a name clash with recent Windows10 headers that broke the build with (at least) | |||||
| recent mingw from MSYS2 | |||||
| * fixed a link error in mixed clang/gfortran builds with OpenMP | |||||
| * updated the OSX deployment target to 10.8 | |||||
| * switched on parallel make for builds on MS Windows by default | |||||
| x86: | |||||
| * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y | |||||
| ==================================================================== | |||||
| Version 0.3.0 | |||||
| 23-May-2108 | |||||
| common: | |||||
| * fixed some more thread race and locking bugs | |||||
| * added preliminary support for calling an OpenMP build of the library from multiple threads | |||||
| * removed performance impact of thread locks added in 0.2.20 on OpenMP code | |||||
| * general code cleanup | |||||
| * optimized DSDOT implementation | |||||
| * improved thread distribution for GEMM | |||||
| * corrected IMATCOPY/OMATCOPY implementation | |||||
| * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations | |||||
| * cmake build improvements | |||||
| * pkgconfig file now contains build options | |||||
| * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build | |||||
| * corrections and improvements for systems with more than 64 cpus | |||||
| * LAPACK code updated to 3.8.0 including later fixes | |||||
| * added ReLAPACK, a recursive implementation of several LAPACK functions | |||||
| * Rewrote ROTMG to handle cases that the netlib code failed to address | |||||
| * Disabled (broken) multithreading code for xTRMV | |||||
| * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard | |||||
| * shared memory access failures on startup are now handled more gracefully | |||||
| * restored utests from earlier releases (and made them pass on all affected systems) | |||||
| SPARC: | |||||
| * several fixes for cpu autodetection | |||||
| POWER: | |||||
| * corrected vector register overwriting in several Power8 kernels | |||||
| * optimized additional BLAS functions | |||||
| ARM: | |||||
| * added support for CortexA53 and A72 | |||||
| * added autodetection for ThunderX2T99 | |||||
| * made most optimized kernels the default for generic ARMv8 targets | |||||
| x86_64: | |||||
| * parallelized DDOT kernel for Haswell | |||||
| * changed alignment directives in assembly kernels to boost performance on OSX | |||||
| * fixed register handling in the GEMV microkernels (bug exposed by gcc7) | |||||
| * added support for building on OpenBSD and Dragonfly | |||||
| * updated compiler options to work with Intel release 2018 | |||||
| * support fully optimized build with clang/flang on Microsoft Windows | |||||
| * fixed building on AIX | |||||
| IBM Z: | |||||
| * added optimized BLAS 1/2 functions | |||||
| MIPS: | |||||
| * fixed cpu autodetection helper code | |||||
| * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) | |||||
| * added mips64 I6500 cpu | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.2.20 | Version 0.2.20 | ||||
| 24-Jul-2017 | 24-Jul-2017 | ||||
| @@ -97,7 +97,7 @@ endif | |||||
| shared : | shared : | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||||
| @$(MAKE) -C exports so | @$(MAKE) -C exports so | ||||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | @ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | ||||
| @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) | |||||
| ifdef SMP | ifdef SMP | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| else ifeq ($(OSNAME), Haiku) | |||||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| else | else | ||||
| -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| endif | endif | ||||
| @@ -66,7 +66,7 @@ endif | |||||
| #for install shared library | #for install shared library | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.2 | |||||
| VERSION = 0.3.3.dev | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 | |||||
| # BUILD_RELAPACK = 1 | # BUILD_RELAPACK = 1 | ||||
| # If you want to use legacy threaded Level 3 implementation. | # If you want to use legacy threaded Level 3 implementation. | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | |||||
| # If you want to use the new, still somewhat experimental code that uses | |||||
| # thread-local storage instead of a central memory buffer in memory.c | |||||
| # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 | |||||
| # for this to work. | |||||
| USE_TLS = 1 | |||||
| # If you want to drive whole 64bit region by BLAS. Not all Fortran | # If you want to drive whole 64bit region by BLAS. Not all Fortran | ||||
| # compiler supports this. It's safe to keep comment it out if you | # compiler supports this. It's safe to keep comment it out if you | ||||
| @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 | |||||
| CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | ||||
| endif | endif | ||||
| ifdef USE_TLS | |||||
| CCOMMON_OPT += -DUSE_TLS | |||||
| endif | |||||
| ifndef SYMBOLPREFIX | ifndef SYMBOLPREFIX | ||||
| SYMBOLPREFIX = | SYMBOLPREFIX = | ||||
| endif | endif | ||||
| @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) | |||||
| ifndef NO_AVX512 | ifndef NO_AVX512 | ||||
| CCOMMON_OPT += -march=skylake-avx512 | CCOMMON_OPT += -march=skylake-avx512 | ||||
| FCOMMON_OPT += -march=skylake-avx512 | FCOMMON_OPT += -march=skylake-avx512 | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | |||||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`. | |||||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | ||||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | ||||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | ||||
| - **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | |||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | ||||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | ||||
| @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 | |||||
| * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | ||||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | ||||
| Clang 3.0 will generate the wrong AVX binary code. | Clang 3.0 will generate the wrong AVX binary code. | ||||
| * Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels. | |||||
| * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), | * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), | ||||
| there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | ||||
| the library with `BIGNUMA=1`. | the library with `BIGNUMA=1`. | ||||
| @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| FLOAT beta [] = {1.0, 1.0}; | |||||
| FLOAT beta [] = {1.0, 0.0}; | |||||
| char trans='N'; | char trans='N'; | ||||
| blasint m, i, j; | blasint m, i, j; | ||||
| blasint inc_x=1,inc_y=1; | blasint inc_x=1,inc_y=1; | ||||
| @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); | |||||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | ||||
| $os = Interix if ($data =~ /OS_INTERIX/); | $os = Interix if ($data =~ /OS_INTERIX/); | ||||
| $os = Android if ($data =~ /OS_ANDROID/); | $os = Android if ($data =~ /OS_ANDROID/); | ||||
| $os = Haiku if ($data =~ /OS_HAIKU/); | |||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | ||||
| @@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/; | |||||
| $need_fu = $1; | $need_fu = $1; | ||||
| $cross = 0; | $cross = 0; | ||||
| $cross = 1 if ($os ne $hostos); | |||||
| if ($architecture ne $hostarch) { | if ($architecture ne $hostarch) { | ||||
| $cross = 1; | $cross = 1; | ||||
| @@ -231,6 +231,8 @@ if ($architecture ne $hostarch) { | |||||
| $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | ||||
| } | } | ||||
| $cross = 1 if ($os ne $hostos); | |||||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | $openmp = "" if $ENV{USE_OPENMP} != 1; | ||||
| $linker_L = ""; | $linker_L = ""; | ||||
| @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") | set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") | ||||
| endif () | endif () | ||||
| if (USE_TLS) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") | |||||
| endif () | |||||
| # Only for development | # Only for development | ||||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") | # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") | ||||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") | # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") | ||||
| @@ -105,6 +105,10 @@ extern "C" { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef OS_HAIKU | |||||
| #define NO_SYSV_IPC | |||||
| #endif | |||||
| #ifdef OS_WINDOWS | #ifdef OS_WINDOWS | ||||
| #ifdef ATOM | #ifdef ATOM | ||||
| #define GOTO_ATOM ATOM | #define GOTO_ATOM ATOM | ||||
| @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG; | |||||
| #ifdef USE64BITINT | #ifdef USE64BITINT | ||||
| typedef BLASLONG blasint; | typedef BLASLONG blasint; | ||||
| #if defined(OS_WINDOWS) && defined(__64BIT__) | |||||
| #define blasabs(x) llabs(x) | |||||
| #else | |||||
| #define blasabs(x) labs(x) | |||||
| #endif | |||||
| #else | #else | ||||
| typedef int blasint; | typedef int blasint; | ||||
| #define blasabs(x) abs(x) | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| #ifdef USE64BITINT | #ifdef USE64BITINT | ||||
| @@ -29,15 +29,18 @@ | |||||
| #define CPU_GENERIC 0 | #define CPU_GENERIC 0 | ||||
| #define CPU_Z13 1 | #define CPU_Z13 1 | ||||
| #define CPU_Z14 2 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "ZARCH_GENERIC", | "ZARCH_GENERIC", | ||||
| "Z13" | |||||
| "Z13", | |||||
| "Z14" | |||||
| }; | }; | ||||
| static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
| "zarch_generic", | "zarch_generic", | ||||
| "z13" | |||||
| "z13", | |||||
| "z14" | |||||
| }; | }; | ||||
| int detect(void) | int detect(void) | ||||
| @@ -62,6 +65,10 @@ int detect(void) | |||||
| if (strstr(p, "2964")) return CPU_Z13; | if (strstr(p, "2964")) return CPU_Z13; | ||||
| if (strstr(p, "2965")) return CPU_Z13; | if (strstr(p, "2965")) return CPU_Z13; | ||||
| /* detect z14, but fall back to z13 */ | |||||
| if (strstr(p, "3906")) return CPU_Z13; | |||||
| if (strstr(p, "3907")) return CPU_Z13; | |||||
| return CPU_GENERIC; | return CPU_GENERIC; | ||||
| } | } | ||||
| @@ -107,5 +114,9 @@ void get_cpuconfig(void) | |||||
| printf("#define Z13\n"); | printf("#define Z13\n"); | ||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| break; | break; | ||||
| case CPU_Z14: | |||||
| printf("#define Z14\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| break; | |||||
| } | } | ||||
| } | } | ||||
| @@ -101,6 +101,10 @@ OS_INTERIX | |||||
| OS_LINUX | OS_LINUX | ||||
| #endif | #endif | ||||
| #if defined(__HAIKU__) | |||||
| OS_HAIKU | |||||
| #endif | |||||
| #if defined(__i386) || defined(_X86) | #if defined(__i386) || defined(_X86) | ||||
| ARCH_X86 | ARCH_X86 | ||||
| #endif | #endif | ||||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) | |||||
| #include <dlfcn.h> | #include <dlfcn.h> | ||||
| #include <signal.h> | #include <signal.h> | ||||
| #include <sys/resource.h> | #include <sys/resource.h> | ||||
| @@ -122,7 +122,7 @@ endif | |||||
| dllinit.$(SUFFIX) : dllinit.c | dllinit.$(SUFFIX) : dllinit.c | ||||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | $(CC) $(CFLAGS) -c -o $(@F) -s $< | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (trans) lenx = m; | if (trans) lenx = m; | ||||
| if (trans) leny = n; | if (trans) leny = n; | ||||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (trans) lenx = m; | if (trans) lenx = m; | ||||
| if (trans) leny = n; | if (trans) leny = n; | ||||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||||
| long double s; | long double s; | ||||
| long double r, roe, z; | long double r, roe, z; | ||||
| long double ada = fabs(da); | |||||
| long double adb = fabs(db); | |||||
| long double ada = fabsl(da); | |||||
| long double adb = fabsl(db); | |||||
| long double scale = ada + adb; | long double scale = ada + adb; | ||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (trans & 1) lenx = m; | if (trans & 1) lenx = m; | ||||
| if (trans & 1) leny = n; | if (trans & 1) leny = n; | ||||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | if (alpha_r == ZERO && alpha_i == ZERO) return; | ||||
| @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (trans & 1) lenx = m; | if (trans & 1) lenx = m; | ||||
| if (trans & 1) leny = n; | if (trans & 1) leny = n; | ||||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | if (alpha_r == ZERO && alpha_i == ZERO) return; | ||||
| @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | ||||
| @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | ||||
| @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | ||||
| @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||||
| long double db_i = *(DB + 1); | long double db_i = *(DB + 1); | ||||
| long double r; | long double r; | ||||
| long double ada = fabs(da_r) + fabs(da_i); | |||||
| long double ada = fabsl(da_r) + fabsl(da_i); | |||||
| PRINT_DEBUG_NAME; | PRINT_DEBUG_NAME; | ||||
| @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * | |||||
| if (n == 0) return; | if (n == 0) return; | ||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); | |||||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); | |||||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | ||||
| @@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), Z13) | |||||
| ifeq ($(ARCH), zarch) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "daxpy_microk_steamroller-2.c" | #include "daxpy_microk_steamroller-2.c" | ||||
| #elif defined(PILEDRIVER) | #elif defined(PILEDRIVER) | ||||
| #include "daxpy_microk_piledriver-2.c" | #include "daxpy_microk_piledriver-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "daxpy_microk_haswell-2.c" | #include "daxpy_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "daxpy_microk_skylakex-2.c" | |||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "daxpy_microk_sandy-2.c" | #include "daxpy_microk_sandy-2.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,71 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #include <immintrin.h> | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| __m256d __alpha; | |||||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| #ifdef __AVX512CD__ | |||||
| BLASLONG n32; | |||||
| __m512d __alpha5; | |||||
| __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| n32 = n & ~31; | |||||
| for (; i < n32; i+= 32) { | |||||
| _mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0])); | |||||
| _mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8])); | |||||
| _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16])); | |||||
| _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24])); | |||||
| } | |||||
| #endif | |||||
| for (; i < n; i+= 16) { | |||||
| _mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0])); | |||||
| _mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4])); | |||||
| _mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8])); | |||||
| _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12])); | |||||
| } | |||||
| } | |||||
| #else | |||||
| #include "daxpy_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "ddot_microk_piledriver-2.c" | #include "ddot_microk_piledriver-2.c" | ||||
| #elif defined(NEHALEM) | #elif defined(NEHALEM) | ||||
| #include "ddot_microk_nehalem-2.c" | #include "ddot_microk_nehalem-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "ddot_microk_haswell-2.c" | #include "ddot_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "ddot_microk_skylakex-2.c" | |||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "ddot_microk_sandy-2.c" | #include "ddot_microk_sandy-2.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,96 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #define HAVE_KERNEL_8 1 | |||||
| #include <immintrin.h> | |||||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| int i = 0; | |||||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||||
| accum_0 = _mm256_setzero_pd(); | |||||
| accum_1 = _mm256_setzero_pd(); | |||||
| accum_2 = _mm256_setzero_pd(); | |||||
| accum_3 = _mm256_setzero_pd(); | |||||
| #ifdef __AVX512CD__ | |||||
| __m512d accum_05, accum_15, accum_25, accum_35; | |||||
| int n32; | |||||
| n32 = n & (~31); | |||||
| accum_05 = _mm512_setzero_pd(); | |||||
| accum_15 = _mm512_setzero_pd(); | |||||
| accum_25 = _mm512_setzero_pd(); | |||||
| accum_35 = _mm512_setzero_pd(); | |||||
| for (; i < n32; i += 32) { | |||||
| accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]); | |||||
| accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]); | |||||
| accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]); | |||||
| accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]); | |||||
| } | |||||
| /* | |||||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||||
| * below can continue using the intermediate results in its loop | |||||
| */ | |||||
| accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1); | |||||
| accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1); | |||||
| accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1); | |||||
| accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1); | |||||
| #endif | |||||
| for (; i < n; i += 16) { | |||||
| accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]); | |||||
| accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]); | |||||
| accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]); | |||||
| accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); | |||||
| } | |||||
| /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ | |||||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||||
| __m128d half_accum0; | |||||
| /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ | |||||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||||
| *dot = half_accum0[0]; | |||||
| } | |||||
| #else | |||||
| #include "ddot_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(NEHALEM) | #if defined(NEHALEM) | ||||
| #include "dgemv_n_microk_nehalem-4.c" | #include "dgemv_n_microk_nehalem-4.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||||
| #include "dgemv_n_microk_haswell-4.c" | #include "dgemv_n_microk_haswell-4.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "dgemv_n_microk_skylakex-4.c" | |||||
| #endif | #endif | ||||
| @@ -0,0 +1,126 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #define HAVE_KERNEL_4x4 1 | |||||
| #include <immintrin.h> | |||||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| int i = 0; | |||||
| __m256d x0, x1, x2, x3; | |||||
| __m256d __alpha; | |||||
| x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); | |||||
| x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); | |||||
| x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); | |||||
| x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); | |||||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| #ifdef __AVX512CD__ | |||||
| int n5; | |||||
| __m512d x05, x15, x25, x35; | |||||
| __m512d __alpha5; | |||||
| n5 = n & ~7; | |||||
| x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); | |||||
| x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); | |||||
| x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); | |||||
| x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); | |||||
| __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| for (; i < n5; i+= 8) { | |||||
| __m512d tempY; | |||||
| __m512d sum; | |||||
| sum = _mm512_loadu_pd(&ap[0][i]) * x05 + | |||||
| _mm512_loadu_pd(&ap[1][i]) * x15 + | |||||
| _mm512_loadu_pd(&ap[2][i]) * x25 + | |||||
| _mm512_loadu_pd(&ap[3][i]) * x35; | |||||
| tempY = _mm512_loadu_pd(&y[i]); | |||||
| tempY += sum * __alpha5; | |||||
| _mm512_storeu_pd(&y[i], tempY); | |||||
| } | |||||
| #endif | |||||
| for (; i < n; i+= 4) { | |||||
| __m256d tempY; | |||||
| __m256d sum; | |||||
| sum = _mm256_loadu_pd(&ap[0][i]) * x0 + | |||||
| _mm256_loadu_pd(&ap[1][i]) * x1 + | |||||
| _mm256_loadu_pd(&ap[2][i]) * x2 + | |||||
| _mm256_loadu_pd(&ap[3][i]) * x3; | |||||
| tempY = _mm256_loadu_pd(&y[i]); | |||||
| tempY += sum * __alpha; | |||||
| _mm256_storeu_pd(&y[i], tempY); | |||||
| } | |||||
| } | |||||
| #define HAVE_KERNEL_4x2 | |||||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| int i = 0; | |||||
| __m256d x0, x1; | |||||
| __m256d __alpha; | |||||
| x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); | |||||
| x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); | |||||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| for (i = 0; i < n; i+= 4) { | |||||
| __m256d tempY; | |||||
| __m256d sum; | |||||
| sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; | |||||
| tempY = _mm256_loadu_pd(&y[i]); | |||||
| tempY += sum * __alpha; | |||||
| _mm256_storeu_pd(&y[i], tempY); | |||||
| } | |||||
| } | |||||
| #else | |||||
| #include "dgemv_n_microk_haswell-4.c" | |||||
| #endif | |||||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "dscal_microk_bulldozer-2.c" | #include "dscal_microk_bulldozer-2.c" | ||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "dscal_microk_sandy-2.c" | #include "dscal_microk_sandy-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "dscal_microk_haswell-2.c" | #include "dscal_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "dscal_microk_skylakex-2.c" | |||||
| #endif | #endif | ||||
| @@ -0,0 +1,77 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014-2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #include <immintrin.h> | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| int i = 0; | |||||
| #ifdef __AVX512CD__ | |||||
| __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| for (; i < n; i += 8) { | |||||
| _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); | |||||
| } | |||||
| #else | |||||
| __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||||
| for (; i < n; i += 8) { | |||||
| _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); | |||||
| _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); | |||||
| } | |||||
| #endif | |||||
| } | |||||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||||
| { | |||||
| int i = 0; | |||||
| /* question to self: Why is this not just memset() */ | |||||
| #ifdef __AVX512CD__ | |||||
| __m512d zero = _mm512_setzero_pd(); | |||||
| for (; i < n; i += 8) { | |||||
| _mm512_storeu_pd(&x[i], zero); | |||||
| } | |||||
| #else | |||||
| __m256d zero = _mm256_setzero_pd(); | |||||
| for (; i < n; i += 8) { | |||||
| _mm256_storeu_pd(&x[i + 0], zero); | |||||
| _mm256_storeu_pd(&x[i + 4], zero); | |||||
| } | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| #include "dscal_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
| #include "dsymv_L_microk_bulldozer-2.c" | #include "dsymv_L_microk_bulldozer-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "dsymv_L_microk_haswell-2.c" | #include "dsymv_L_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "dsymv_L_microk_skylakex-2.c" | |||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "dsymv_L_microk_sandy-2.c" | #include "dsymv_L_microk_sandy-2.c" | ||||
| #elif defined(NEHALEM) | #elif defined(NEHALEM) | ||||
| @@ -0,0 +1,161 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #include <immintrin.h> | |||||
| #define HAVE_KERNEL_4x4 1 | |||||
| static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||||
| { | |||||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||||
| __m256d temp1_0, temp1_1, temp1_2, temp1_3; | |||||
| /* the 256 bit wide acculmulator vectors start out as zero */ | |||||
| accum_0 = _mm256_setzero_pd(); | |||||
| accum_1 = _mm256_setzero_pd(); | |||||
| accum_2 = _mm256_setzero_pd(); | |||||
| accum_3 = _mm256_setzero_pd(); | |||||
| temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); | |||||
| temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); | |||||
| temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); | |||||
| temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); | |||||
| #ifdef __AVX512CD__ | |||||
| __m512d accum_05, accum_15, accum_25, accum_35; | |||||
| __m512d temp1_05, temp1_15, temp1_25, temp1_35; | |||||
| BLASLONG to2; | |||||
| int delta; | |||||
| /* the 512 bit wide accumulator vectors start out as zero */ | |||||
| accum_05 = _mm512_setzero_pd(); | |||||
| accum_15 = _mm512_setzero_pd(); | |||||
| accum_25 = _mm512_setzero_pd(); | |||||
| accum_35 = _mm512_setzero_pd(); | |||||
| temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); | |||||
| temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); | |||||
| temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); | |||||
| temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); | |||||
| delta = (to - from) & ~7; | |||||
| to2 = from + delta; | |||||
| for (; from < to2; from += 8) { | |||||
| __m512d _x, _y; | |||||
| __m512d a0, a1, a2, a3; | |||||
| _y = _mm512_loadu_pd(&y[from]); | |||||
| _x = _mm512_loadu_pd(&x[from]); | |||||
| a0 = _mm512_loadu_pd(&a[0][from]); | |||||
| a1 = _mm512_loadu_pd(&a[1][from]); | |||||
| a2 = _mm512_loadu_pd(&a[2][from]); | |||||
| a3 = _mm512_loadu_pd(&a[3][from]); | |||||
| _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; | |||||
| accum_05 += _x * a0; | |||||
| accum_15 += _x * a1; | |||||
| accum_25 += _x * a2; | |||||
| accum_35 += _x * a3; | |||||
| _mm512_storeu_pd(&y[from], _y); | |||||
| }; | |||||
| /* | |||||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||||
| * below can continue using the intermediate results in its loop | |||||
| */ | |||||
| accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); | |||||
| accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); | |||||
| accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); | |||||
| accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); | |||||
| #endif | |||||
| for (; from != to; from += 4) { | |||||
| __m256d _x, _y; | |||||
| __m256d a0, a1, a2, a3; | |||||
| _y = _mm256_loadu_pd(&y[from]); | |||||
| _x = _mm256_loadu_pd(&x[from]); | |||||
| /* load 4 rows of matrix data */ | |||||
| a0 = _mm256_loadu_pd(&a[0][from]); | |||||
| a1 = _mm256_loadu_pd(&a[1][from]); | |||||
| a2 = _mm256_loadu_pd(&a[2][from]); | |||||
| a3 = _mm256_loadu_pd(&a[3][from]); | |||||
| _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; | |||||
| accum_0 += _x * a0; | |||||
| accum_1 += _x * a1; | |||||
| accum_2 += _x * a2; | |||||
| accum_3 += _x * a3; | |||||
| _mm256_storeu_pd(&y[from], _y); | |||||
| }; | |||||
| /* | |||||
| * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 | |||||
| * output array. There is no direct instruction for this in 256 bit space, only in 128 space. | |||||
| */ | |||||
| __m128d half_accum0, half_accum1, half_accum2, half_accum3; | |||||
| /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ | |||||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||||
| half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); | |||||
| half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); | |||||
| half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); | |||||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||||
| half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); | |||||
| half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); | |||||
| half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); | |||||
| /* and store the lowest double value from each of these vectors in the temp2 output */ | |||||
| temp2[0] += half_accum0[0]; | |||||
| temp2[1] += half_accum1[0]; | |||||
| temp2[2] += half_accum2[0]; | |||||
| temp2[3] += half_accum3[0]; | |||||
| } | |||||
| #else | |||||
| #include "dsymv_L_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(NEHALEM) | #if defined(NEHALEM) | ||||
| #include "saxpy_microk_nehalem-2.c" | #include "saxpy_microk_nehalem-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "saxpy_microk_haswell-2.c" | #include "saxpy_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "saxpy_microk_skylakex-2.c" | |||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "saxpy_microk_sandy-2.c" | #include "saxpy_microk_sandy-2.c" | ||||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
| @@ -0,0 +1,69 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #define HAVE_KERNEL_16 1 | |||||
| #include <immintrin.h> | |||||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| __m256 __alpha; | |||||
| __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); | |||||
| #ifdef __AVX512CD__ | |||||
| BLASLONG n64; | |||||
| __m512 __alpha5; | |||||
| __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); | |||||
| n64 = n & ~63; | |||||
| for (; i < n64; i+= 64) { | |||||
| _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); | |||||
| _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); | |||||
| _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); | |||||
| _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); | |||||
| } | |||||
| #endif | |||||
| for (; i < n; i+= 32) { | |||||
| _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); | |||||
| _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); | |||||
| _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); | |||||
| _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); | |||||
| } | |||||
| } | |||||
| #else | |||||
| #include "saxpy_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "sdot_microk_steamroller-2.c" | #include "sdot_microk_steamroller-2.c" | ||||
| #elif defined(NEHALEM) | #elif defined(NEHALEM) | ||||
| #include "sdot_microk_nehalem-2.c" | #include "sdot_microk_nehalem-2.c" | ||||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "sdot_microk_haswell-2.c" | #include "sdot_microk_haswell-2.c" | ||||
| #elif defined (SKYLAKEX) | |||||
| #include "sdot_microk_skylakex-2.c" | |||||
| #elif defined(SANDYBRIDGE) | #elif defined(SANDYBRIDGE) | ||||
| #include "sdot_microk_sandy-2.c" | #include "sdot_microk_sandy-2.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,98 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||||
| #define HAVE_KERNEL_16 1 | |||||
| #include <immintrin.h> | |||||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| int i = 0; | |||||
| __m256 accum_0, accum_1, accum_2, accum_3; | |||||
| accum_0 = _mm256_setzero_ps(); | |||||
| accum_1 = _mm256_setzero_ps(); | |||||
| accum_2 = _mm256_setzero_ps(); | |||||
| accum_3 = _mm256_setzero_ps(); | |||||
| #ifdef __AVX512CD__ | |||||
| __m512 accum_05, accum_15, accum_25, accum_35; | |||||
| int n64; | |||||
| n64 = n & (~63); | |||||
| accum_05 = _mm512_setzero_ps(); | |||||
| accum_15 = _mm512_setzero_ps(); | |||||
| accum_25 = _mm512_setzero_ps(); | |||||
| accum_35 = _mm512_setzero_ps(); | |||||
| for (; i < n64; i += 64) { | |||||
| accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]); | |||||
| accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); | |||||
| accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); | |||||
| accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); | |||||
| } | |||||
| /* | |||||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||||
| * below can continue using the intermediate results in its loop | |||||
| */ | |||||
| accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); | |||||
| accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); | |||||
| accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); | |||||
| accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1); | |||||
| #endif | |||||
| for (; i < n; i += 32) { | |||||
| accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); | |||||
| accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); | |||||
| accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); | |||||
| accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); | |||||
| } | |||||
| /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ | |||||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||||
| __m128 half_accum0; | |||||
| /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ | |||||
| half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1); | |||||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||||
| *dot = half_accum0[0]; | |||||
| } | |||||
| #else | |||||
| #include "sdot_microk_haswell-2.c" | |||||
| #endif | |||||
| @@ -280,8 +280,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -297,9 +297,9 @@ | |||||
| * | * | ||||
| * Determine the block size, the workspace size and the hous size. | * Determine the block size, the workspace size and the hous size. | ||||
| * | * | ||||
| IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| * | * | ||||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -285,8 +285,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -296,7 +296,7 @@ | |||||
| INFO = 0 | INFO = 0 | ||||
| UPPER = LSAME( UPLO, 'U' ) | UPPER = LSAME( UPLO, 'U' ) | ||||
| LQUERY = ( LWORK.EQ.-1 ) | LQUERY = ( LWORK.EQ.-1 ) | ||||
| LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) | |||||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -277,8 +277,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -294,9 +294,9 @@ | |||||
| * | * | ||||
| * Determine the block size, the workspace size and the hous size. | * Determine the block size, the workspace size and the hous size. | ||||
| * | * | ||||
| IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| * | * | ||||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -285,8 +285,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -296,7 +296,7 @@ | |||||
| INFO = 0 | INFO = 0 | ||||
| UPPER = LSAME( UPLO, 'U' ) | UPPER = LSAME( UPLO, 'U' ) | ||||
| LQUERY = ( LWORK.EQ.-1 ) | LQUERY = ( LWORK.EQ.-1 ) | ||||
| LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -277,8 +277,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -294,9 +294,9 @@ | |||||
| * | * | ||||
| * Determine the block size, the workspace size and the hous size. | * Determine the block size, the workspace size and the hous size. | ||||
| * | * | ||||
| IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||||
| * | * | ||||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -285,8 +285,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -296,7 +296,7 @@ | |||||
| INFO = 0 | INFO = 0 | ||||
| UPPER = LSAME( UPLO, 'U' ) | UPPER = LSAME( UPLO, 'U' ) | ||||
| LQUERY = ( LWORK.EQ.-1 ) | LQUERY = ( LWORK.EQ.-1 ) | ||||
| LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -280,8 +280,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -297,9 +297,9 @@ | |||||
| * | * | ||||
| * Determine the block size, the workspace size and the hous size. | * Determine the block size, the workspace size and the hous size. | ||||
| * | * | ||||
| IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||||
| LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||||
| * | * | ||||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||
| @@ -285,8 +285,8 @@ | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | |||||
| EXTERNAL LSAME, ILAENV | |||||
| INTEGER ILAENV2STAGE | |||||
| EXTERNAL LSAME, ILAENV2STAGE | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -296,7 +296,7 @@ | |||||
| INFO = 0 | INFO = 0 | ||||
| UPPER = LSAME( UPLO, 'U' ) | UPPER = LSAME( UPLO, 'U' ) | ||||
| LQUERY = ( LWORK.EQ.-1 ) | LQUERY = ( LWORK.EQ.-1 ) | ||||
| LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) | |||||
| LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) | |||||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | ||||
| INFO = -1 | INFO = -1 | ||||