Merge develop into 0.3.x for 0.3.3 (tags/v0.3.3)
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 2) | |||
| set(OpenBLAS_PATCH_VERSION 3.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -150,6 +150,7 @@ endif() | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| @@ -169,6 +170,7 @@ endif() | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
| @@ -1,4 +1,115 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.2 | |||
| 30-Jul-2018 | |||
| common: | |||
| * fixes for regressions caused by the rewrite of the thread | |||
| initialization code in 0.3.1 | |||
| POWER: | |||
| * fixed cpu autodetection for the BSDs | |||
| MIPS64: | |||
| * fixed utest errors in AXPY, DSDOT, ROT and SWAP | |||
| x86_64: | |||
| * added autodetection of AMD Ryzen 2 | |||
| * fixed build with older versions of MSVC | |||
| ==================================================================== | |||
| Version 0.3.1 | |||
| 01-Jul-2018 | |||
| common: | |||
| * rewritten thread initialization code with significantly reduced overhead | |||
| * added CBLAS interfaces to the IxAMIN BLAS extension functions | |||
| * fixed the lapack-test target | |||
| * CMAKE builds now create an OpenBLASConfig.cmake file | |||
| * ZAXPY now uses a single thread for small input sizes | |||
| * the LAPACK code was updated from Reference-LAPACK/lapack#253 | |||
| (fixing LAPACKE interfaces to Aasen's functions) | |||
| POWER: | |||
| * corrected CROT and ZROT behaviour with zero INC_X | |||
| ARMV7: | |||
| * corrected xDOT behaviour with zero INC_X or INC_Y | |||
| x86_64: | |||
| * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, | |||
| this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO | |||
| (which will still be supported via the slower PRESCOTT kernels when this option is not set) | |||
| * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to | |||
| specify the list of x86_64 targets to include. Any target not on the list will be supported | |||
| by the Sandybridge or Nehalem kernels if available, or by Prescott. | |||
| * improved SWITCH_RATIO on Haswell for increased GEMM throughput | |||
| * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel | |||
| * added autodetection of Intel Cannon Lake series as Skylake X | |||
| * added a default L2 cache size for hypervisors that return zero here (Chromebook) | |||
| * fixed a name clash with recent Windows10 headers that broke the build with (at least) | |||
| recent mingw from MSYS2 | |||
| * fixed a link error in mixed clang/gfortran builds with OpenMP | |||
| * updated the OSX deployment target to 10.8 | |||
| * switched on parallel make for builds on MS Windows by default | |||
| x86: | |||
| * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y | |||
| ==================================================================== | |||
| Version 0.3.0 | |||
| 23-May-2018 | |||
| common: | |||
| * fixed some more thread race and locking bugs | |||
| * added preliminary support for calling an OpenMP build of the library from multiple threads | |||
| * removed performance impact of thread locks added in 0.2.20 on OpenMP code | |||
| * general code cleanup | |||
| * optimized DSDOT implementation | |||
| * improved thread distribution for GEMM | |||
| * corrected IMATCOPY/OMATCOPY implementation | |||
| * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations | |||
| * cmake build improvements | |||
| * pkgconfig file now contains build options | |||
| * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build | |||
| * corrections and improvements for systems with more than 64 cpus | |||
| * LAPACK code updated to 3.8.0 including later fixes | |||
| * added ReLAPACK, a recursive implementation of several LAPACK functions | |||
| * Rewrote ROTMG to handle cases that the netlib code failed to address | |||
| * Disabled (broken) multithreading code for xTRMV | |||
| * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard | |||
| * shared memory access failures on startup are now handled more gracefully | |||
| * restored utests from earlier releases (and made them pass on all affected systems) | |||
| SPARC: | |||
| * several fixes for cpu autodetection | |||
| POWER: | |||
| * corrected vector register overwriting in several Power8 kernels | |||
| * optimized additional BLAS functions | |||
| ARM: | |||
| * added support for CortexA53 and A72 | |||
| * added autodetection for ThunderX2T99 | |||
| * made most optimized kernels the default for generic ARMv8 targets | |||
| x86_64: | |||
| * parallelized DDOT kernel for Haswell | |||
| * changed alignment directives in assembly kernels to boost performance on OSX | |||
| * fixed register handling in the GEMV microkernels (bug exposed by gcc7) | |||
| * added support for building on OpenBSD and Dragonfly | |||
| * updated compiler options to work with Intel release 2018 | |||
| * support fully optimized build with clang/flang on Microsoft Windows | |||
| * fixed building on AIX | |||
| IBM Z: | |||
| * added optimized BLAS 1/2 functions | |||
| MIPS: | |||
| * fixed cpu autodetection helper code | |||
| * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) | |||
| * added mips64 I6500 cpu | |||
| ==================================================================== | |||
| Version 0.2.20 | |||
| 24-Jul-2017 | |||
| @@ -97,7 +97,7 @@ endif | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifdef SMP | |||
| ifeq ($(OSNAME), WINNT) | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else ifeq ($(OSNAME), Haiku) | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| @@ -66,7 +66,7 @@ endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.2 | |||
| VERSION = 0.3.3.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # BUILD_RELAPACK = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # If you want to use the new, still somewhat experimental code that uses | |||
| # thread-local storage instead of a central memory buffer in memory.c | |||
| # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 | |||
| # for this to work. | |||
| USE_TLS = 1 | |||
| # If you want to drive the whole 64bit region by BLAS. Not all Fortran | |||
| # compilers support this. It's safe to keep it commented out if you | |||
| @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 | |||
| CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | |||
| endif | |||
| ifdef USE_TLS | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| ifndef SYMBOLPREFIX | |||
| SYMBOLPREFIX = | |||
| endif | |||
| @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX) | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`. | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | |||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | |||
| - **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 | |||
| * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | |||
| Clang 3.0 will generate the wrong AVX binary code. | |||
| * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. | |||
| * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), | |||
| there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | |||
| the library with `BIGNUMA=1`. | |||
| @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 0.0}; | |||
| char trans='N'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1,inc_y=1; | |||
| @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); | |||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||
| $os = Interix if ($data =~ /OS_INTERIX/); | |||
| $os = Android if ($data =~ /OS_ANDROID/); | |||
| $os = Haiku if ($data =~ /OS_HAIKU/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| @@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/; | |||
| $need_fu = $1; | |||
| $cross = 0; | |||
| $cross = 1 if ($os ne $hostos); | |||
| if ($architecture ne $hostarch) { | |||
| $cross = 1; | |||
| @@ -231,6 +231,8 @@ if ($architecture ne $hostarch) { | |||
| $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | |||
| } | |||
| $cross = 1 if ($os ne $hostos); | |||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | |||
| $linker_L = ""; | |||
| @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") | |||
| endif () | |||
| if (USE_TLS) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") | |||
| endif () | |||
| # Only for development | |||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") | |||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") | |||
| @@ -105,6 +105,10 @@ extern "C" { | |||
| #endif | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| #define NO_SYSV_IPC | |||
| #endif | |||
| #ifdef OS_WINDOWS | |||
| #ifdef ATOM | |||
| #define GOTO_ATOM ATOM | |||
| @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG; | |||
| #ifdef USE64BITINT | |||
| typedef BLASLONG blasint; | |||
| #if defined(OS_WINDOWS) && defined(__64BIT__) | |||
| #define blasabs(x) llabs(x) | |||
| #else | |||
| #define blasabs(x) labs(x) | |||
| #endif | |||
| #else | |||
| typedef int blasint; | |||
| #define blasabs(x) abs(x) | |||
| #endif | |||
| #else | |||
| #ifdef USE64BITINT | |||
| @@ -29,15 +29,18 @@ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13" | |||
| "Z13", | |||
| "Z14" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13" | |||
| "z13", | |||
| "z14" | |||
| }; | |||
| int detect(void) | |||
| @@ -62,6 +65,10 @@ int detect(void) | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| /* detect z14, but fall back to z13 */ | |||
| if (strstr(p, "3906")) return CPU_Z13; | |||
| if (strstr(p, "3907")) return CPU_Z13; | |||
| return CPU_GENERIC; | |||
| } | |||
| @@ -107,5 +114,9 @@ void get_cpuconfig(void) | |||
| printf("#define Z13\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| case CPU_Z14: | |||
| printf("#define Z14\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -101,6 +101,10 @@ OS_INTERIX | |||
| OS_LINUX | |||
| #endif | |||
| #if defined(__HAIKU__) | |||
| OS_HAIKU | |||
| #endif | |||
| #if defined(__i386) || defined(_X86) | |||
| ARCH_X86 | |||
| #endif | |||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) | |||
| #include <dlfcn.h> | |||
| #include <signal.h> | |||
| #include <sys/resource.h> | |||
| @@ -122,7 +122,7 @@ endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| so : ../$(LIBSONAME) | |||
| @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans) lenx = m; | |||
| if (trans) leny = n; | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans) lenx = m; | |||
| if (trans) leny = n; | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| long double s; | |||
| long double r, roe, z; | |||
| long double ada = fabs(da); | |||
| long double adb = fabs(db); | |||
| long double ada = fabsl(da); | |||
| long double adb = fabsl(db); | |||
| long double scale = ada + adb; | |||
| #ifndef CBLAS | |||
| @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans & 1) lenx = m; | |||
| if (trans & 1) leny = n; | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | |||
| @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans & 1) lenx = m; | |||
| if (trans & 1) leny = n; | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | |||
| @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| long double db_i = *(DB + 1); | |||
| long double r; | |||
| long double ada = fabs(da_r) + fabs(da_i); | |||
| long double ada = fabsl(da_r) + fabsl(da_i); | |||
| PRINT_DEBUG_NAME; | |||
| @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), Z13) | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "daxpy_microk_steamroller-2.c" | |||
| #elif defined(PILEDRIVER) | |||
| #include "daxpy_microk_piledriver-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "daxpy_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "daxpy_microk_skylakex-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "daxpy_microk_sandy-2.c" | |||
| #endif | |||
| @@ -0,0 +1,71 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #include <immintrin.h> | |||
| #define HAVE_KERNEL_8 1 | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m256d __alpha; | |||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| #ifdef __AVX512CD__ | |||
| BLASLONG n32; | |||
| __m512d __alpha5; | |||
| __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| n32 = n & ~31; | |||
| for (; i < n32; i+= 32) { | |||
| _mm512_storeu_pd(&y[i + 0], _mm512_loadu_pd(&y[i + 0]) + __alpha5 * _mm512_loadu_pd(&x[i + 0])); | |||
| _mm512_storeu_pd(&y[i + 8], _mm512_loadu_pd(&y[i + 8]) + __alpha5 * _mm512_loadu_pd(&x[i + 8])); | |||
| _mm512_storeu_pd(&y[i + 16], _mm512_loadu_pd(&y[i + 16]) + __alpha5 * _mm512_loadu_pd(&x[i + 16])); | |||
| _mm512_storeu_pd(&y[i + 24], _mm512_loadu_pd(&y[i + 24]) + __alpha5 * _mm512_loadu_pd(&x[i + 24])); | |||
| } | |||
| #endif | |||
| for (; i < n; i+= 16) { | |||
| _mm256_storeu_pd(&y[i + 0], _mm256_loadu_pd(&y[i + 0]) + __alpha * _mm256_loadu_pd(&x[i + 0])); | |||
| _mm256_storeu_pd(&y[i + 4], _mm256_loadu_pd(&y[i + 4]) + __alpha * _mm256_loadu_pd(&x[i + 4])); | |||
| _mm256_storeu_pd(&y[i + 8], _mm256_loadu_pd(&y[i + 8]) + __alpha * _mm256_loadu_pd(&x[i + 8])); | |||
| _mm256_storeu_pd(&y[i + 12], _mm256_loadu_pd(&y[i + 12]) + __alpha * _mm256_loadu_pd(&x[i + 12])); | |||
| } | |||
| } | |||
| #else | |||
| #include "daxpy_microk_haswell-2.c" | |||
| #endif | |||
| @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "ddot_microk_piledriver-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ddot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "ddot_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "ddot_microk_skylakex-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "ddot_microk_sandy-2.c" | |||
| #endif | |||
| @@ -0,0 +1,96 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_8 1 | |||
| #include <immintrin.h> | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| int i = 0; | |||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_pd(); | |||
| accum_1 = _mm256_setzero_pd(); | |||
| accum_2 = _mm256_setzero_pd(); | |||
| accum_3 = _mm256_setzero_pd(); | |||
| #ifdef __AVX512CD__ | |||
| __m512d accum_05, accum_15, accum_25, accum_35; | |||
| int n32; | |||
| n32 = n & (~31); | |||
| accum_05 = _mm512_setzero_pd(); | |||
| accum_15 = _mm512_setzero_pd(); | |||
| accum_25 = _mm512_setzero_pd(); | |||
| accum_35 = _mm512_setzero_pd(); | |||
| for (; i < n32; i += 32) { | |||
| accum_05 += _mm512_loadu_pd(&x[i+ 0]) * _mm512_loadu_pd(&y[i+ 0]); | |||
| accum_15 += _mm512_loadu_pd(&x[i+ 8]) * _mm512_loadu_pd(&y[i+ 8]); | |||
| accum_25 += _mm512_loadu_pd(&x[i+16]) * _mm512_loadu_pd(&y[i+16]); | |||
| accum_35 += _mm512_loadu_pd(&x[i+24]) * _mm512_loadu_pd(&y[i+24]); | |||
| } | |||
| /* | |||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||
| * below can continue using the intermediate results in its loop | |||
| */ | |||
| accum_0 = _mm512_extractf64x4_pd(accum_05, 0) + _mm512_extractf64x4_pd(accum_05, 1); | |||
| accum_1 = _mm512_extractf64x4_pd(accum_15, 0) + _mm512_extractf64x4_pd(accum_15, 1); | |||
| accum_2 = _mm512_extractf64x4_pd(accum_25, 0) + _mm512_extractf64x4_pd(accum_25, 1); | |||
| accum_3 = _mm512_extractf64x4_pd(accum_35, 0) + _mm512_extractf64x4_pd(accum_35, 1); | |||
| #endif | |||
| for (; i < n; i += 16) { | |||
| accum_0 += _mm256_loadu_pd(&x[i+ 0]) * _mm256_loadu_pd(&y[i+ 0]); | |||
| accum_1 += _mm256_loadu_pd(&x[i+ 4]) * _mm256_loadu_pd(&y[i+ 4]); | |||
| accum_2 += _mm256_loadu_pd(&x[i+ 8]) * _mm256_loadu_pd(&y[i+ 8]); | |||
| accum_3 += _mm256_loadu_pd(&x[i+12]) * _mm256_loadu_pd(&y[i+12]); | |||
| } | |||
| /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128d half_accum0; | |||
| /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ | |||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||
| *dot = half_accum0[0]; | |||
| } | |||
| #else | |||
| #include "ddot_microk_haswell-2.c" | |||
| #endif | |||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "dgemv_n_microk_skylakex-4.c" | |||
| #endif | |||
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_4x4 1 | |||
| #include <immintrin.h> | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| int i = 0; | |||
| __m256d x0, x1, x2, x3; | |||
| __m256d __alpha; | |||
| x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); | |||
| x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); | |||
| x2 = _mm256_broadcastsd_pd(_mm_load_sd(&x[2])); | |||
| x3 = _mm256_broadcastsd_pd(_mm_load_sd(&x[3])); | |||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| #ifdef __AVX512CD__ | |||
| int n5; | |||
| __m512d x05, x15, x25, x35; | |||
| __m512d __alpha5; | |||
| n5 = n & ~7; | |||
| x05 = _mm512_broadcastsd_pd(_mm_load_sd(&x[0])); | |||
| x15 = _mm512_broadcastsd_pd(_mm_load_sd(&x[1])); | |||
| x25 = _mm512_broadcastsd_pd(_mm_load_sd(&x[2])); | |||
| x35 = _mm512_broadcastsd_pd(_mm_load_sd(&x[3])); | |||
| __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| for (; i < n5; i+= 8) { | |||
| __m512d tempY; | |||
| __m512d sum; | |||
| sum = _mm512_loadu_pd(&ap[0][i]) * x05 + | |||
| _mm512_loadu_pd(&ap[1][i]) * x15 + | |||
| _mm512_loadu_pd(&ap[2][i]) * x25 + | |||
| _mm512_loadu_pd(&ap[3][i]) * x35; | |||
| tempY = _mm512_loadu_pd(&y[i]); | |||
| tempY += sum * __alpha5; | |||
| _mm512_storeu_pd(&y[i], tempY); | |||
| } | |||
| #endif | |||
| for (; i < n; i+= 4) { | |||
| __m256d tempY; | |||
| __m256d sum; | |||
| sum = _mm256_loadu_pd(&ap[0][i]) * x0 + | |||
| _mm256_loadu_pd(&ap[1][i]) * x1 + | |||
| _mm256_loadu_pd(&ap[2][i]) * x2 + | |||
| _mm256_loadu_pd(&ap[3][i]) * x3; | |||
| tempY = _mm256_loadu_pd(&y[i]); | |||
| tempY += sum * __alpha; | |||
| _mm256_storeu_pd(&y[i], tempY); | |||
| } | |||
| } | |||
| #define HAVE_KERNEL_4x2 | |||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| int i = 0; | |||
| __m256d x0, x1; | |||
| __m256d __alpha; | |||
| x0 = _mm256_broadcastsd_pd(_mm_load_sd(&x[0])); | |||
| x1 = _mm256_broadcastsd_pd(_mm_load_sd(&x[1])); | |||
| __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| for (i = 0; i < n; i+= 4) { | |||
| __m256d tempY; | |||
| __m256d sum; | |||
| sum = _mm256_loadu_pd(&ap[0][i]) * x0 + _mm256_loadu_pd(&ap[1][i]) * x1; | |||
| tempY = _mm256_loadu_pd(&y[i]); | |||
| tempY += sum * __alpha; | |||
| _mm256_storeu_pd(&y[i], tempY); | |||
| } | |||
| } | |||
| #else | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "dscal_microk_bulldozer-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dscal_microk_sandy-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "dscal_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "dscal_microk_skylakex-2.c" | |||
| #endif | |||
| @@ -0,0 +1,77 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014-2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #include <immintrin.h> | |||
| #define HAVE_KERNEL_8 1 | |||
| static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| { | |||
| int i = 0; | |||
| #ifdef __AVX512CD__ | |||
| __m512d __alpha5 = _mm512_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| for (; i < n; i += 8) { | |||
| _mm512_storeu_pd(&x[i + 0], __alpha5 * _mm512_loadu_pd(&x[i + 0])); | |||
| } | |||
| #else | |||
| __m256d __alpha = _mm256_broadcastsd_pd(_mm_load_sd(alpha)); | |||
| for (; i < n; i += 8) { | |||
| _mm256_storeu_pd(&x[i + 0], __alpha * _mm256_loadu_pd(&x[i + 0])); | |||
| _mm256_storeu_pd(&x[i + 4], __alpha * _mm256_loadu_pd(&x[i + 4])); | |||
| } | |||
| #endif | |||
| } | |||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| { | |||
| int i = 0; | |||
| /* question to self: Why is this not just memset() */ | |||
| #ifdef __AVX512CD__ | |||
| __m512d zero = _mm512_setzero_pd(); | |||
| for (; i < n; i += 8) { | |||
| _mm512_storeu_pd(&x[i], zero); | |||
| } | |||
| #else | |||
| __m256d zero = _mm256_setzero_pd(); | |||
| for (; i < n; i += 8) { | |||
| _mm256_storeu_pd(&x[i + 0], zero); | |||
| _mm256_storeu_pd(&x[i + 4], zero); | |||
| } | |||
| #endif | |||
| } | |||
| #else | |||
| #include "dscal_microk_haswell-2.c" | |||
| #endif | |||
| @@ -30,8 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dsymv_L_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "dsymv_L_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "dsymv_L_microk_skylakex-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dsymv_L_microk_sandy-2.c" | |||
| #elif defined(NEHALEM) | |||
| @@ -0,0 +1,161 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #include <immintrin.h> | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||
| __m256d temp1_0, temp1_1, temp1_2, temp1_3; | |||
| /* the 256 bit wide acculmulator vectors start out as zero */ | |||
| accum_0 = _mm256_setzero_pd(); | |||
| accum_1 = _mm256_setzero_pd(); | |||
| accum_2 = _mm256_setzero_pd(); | |||
| accum_3 = _mm256_setzero_pd(); | |||
| temp1_0 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[0])); | |||
| temp1_1 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[1])); | |||
| temp1_2 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[2])); | |||
| temp1_3 = _mm256_broadcastsd_pd(_mm_load_sd(&temp1[3])); | |||
| #ifdef __AVX512CD__ | |||
| __m512d accum_05, accum_15, accum_25, accum_35; | |||
| __m512d temp1_05, temp1_15, temp1_25, temp1_35; | |||
| BLASLONG to2; | |||
| int delta; | |||
| /* the 512 bit wide accumulator vectors start out as zero */ | |||
| accum_05 = _mm512_setzero_pd(); | |||
| accum_15 = _mm512_setzero_pd(); | |||
| accum_25 = _mm512_setzero_pd(); | |||
| accum_35 = _mm512_setzero_pd(); | |||
| temp1_05 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[0])); | |||
| temp1_15 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[1])); | |||
| temp1_25 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[2])); | |||
| temp1_35 = _mm512_broadcastsd_pd(_mm_load_sd(&temp1[3])); | |||
| delta = (to - from) & ~7; | |||
| to2 = from + delta; | |||
| for (; from < to2; from += 8) { | |||
| __m512d _x, _y; | |||
| __m512d a0, a1, a2, a3; | |||
| _y = _mm512_loadu_pd(&y[from]); | |||
| _x = _mm512_loadu_pd(&x[from]); | |||
| a0 = _mm512_loadu_pd(&a[0][from]); | |||
| a1 = _mm512_loadu_pd(&a[1][from]); | |||
| a2 = _mm512_loadu_pd(&a[2][from]); | |||
| a3 = _mm512_loadu_pd(&a[3][from]); | |||
| _y += temp1_05 * a0 + temp1_15 * a1 + temp1_25 * a2 + temp1_35 * a3; | |||
| accum_05 += _x * a0; | |||
| accum_15 += _x * a1; | |||
| accum_25 += _x * a2; | |||
| accum_35 += _x * a3; | |||
| _mm512_storeu_pd(&y[from], _y); | |||
| }; | |||
| /* | |||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||
| * below can continue using the intermediate results in its loop | |||
| */ | |||
| accum_0 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_05, 0), _mm512_extractf64x4_pd(accum_05, 1)); | |||
| accum_1 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_15, 0), _mm512_extractf64x4_pd(accum_15, 1)); | |||
| accum_2 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_25, 0), _mm512_extractf64x4_pd(accum_25, 1)); | |||
| accum_3 = _mm256_add_pd(_mm512_extractf64x4_pd(accum_35, 0), _mm512_extractf64x4_pd(accum_35, 1)); | |||
| #endif | |||
| for (; from != to; from += 4) { | |||
| __m256d _x, _y; | |||
| __m256d a0, a1, a2, a3; | |||
| _y = _mm256_loadu_pd(&y[from]); | |||
| _x = _mm256_loadu_pd(&x[from]); | |||
| /* load 4 rows of matrix data */ | |||
| a0 = _mm256_loadu_pd(&a[0][from]); | |||
| a1 = _mm256_loadu_pd(&a[1][from]); | |||
| a2 = _mm256_loadu_pd(&a[2][from]); | |||
| a3 = _mm256_loadu_pd(&a[3][from]); | |||
| _y += temp1_0 * a0 + temp1_1 * a1 + temp1_2 * a2 + temp1_3 * a3; | |||
| accum_0 += _x * a0; | |||
| accum_1 += _x * a1; | |||
| accum_2 += _x * a2; | |||
| accum_3 += _x * a3; | |||
| _mm256_storeu_pd(&y[from], _y); | |||
| }; | |||
| /* | |||
| * we now have 4 accumulator vectors. Each vector needs to be summed up element wise and stored in the temp2 | |||
| * output array. There is no direct instruction for this in 256 bit space, only in 128 space. | |||
| */ | |||
| __m128d half_accum0, half_accum1, half_accum2, half_accum3; | |||
| /* Add upper half to lower half of each of the four 256 bit vectors to get to four 128 bit vectors */ | |||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||
| half_accum1 = _mm_add_pd(_mm256_extractf128_pd(accum_1, 0), _mm256_extractf128_pd(accum_1, 1)); | |||
| half_accum2 = _mm_add_pd(_mm256_extractf128_pd(accum_2, 0), _mm256_extractf128_pd(accum_2, 1)); | |||
| half_accum3 = _mm_add_pd(_mm256_extractf128_pd(accum_3, 0), _mm256_extractf128_pd(accum_3, 1)); | |||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||
| half_accum1 = _mm_hadd_pd(half_accum1, half_accum1); | |||
| half_accum2 = _mm_hadd_pd(half_accum2, half_accum2); | |||
| half_accum3 = _mm_hadd_pd(half_accum3, half_accum3); | |||
| /* and store the lowest double value from each of these vectors in the temp2 output */ | |||
| temp2[0] += half_accum0[0]; | |||
| temp2[1] += half_accum1[0]; | |||
| temp2[2] += half_accum2[0]; | |||
| temp2[3] += half_accum3[0]; | |||
| } | |||
| #else | |||
| #include "dsymv_L_microk_haswell-2.c" | |||
| #endif | |||
| @@ -31,8 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "saxpy_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "saxpy_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "saxpy_microk_skylakex-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "saxpy_microk_sandy-2.c" | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| @@ -0,0 +1,69 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_16 1 | |||
| #include <immintrin.h> | |||
| static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m256 __alpha; | |||
| __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); | |||
| #ifdef __AVX512CD__ | |||
| BLASLONG n64; | |||
| __m512 __alpha5; | |||
| __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); | |||
| n64 = n & ~63; | |||
| for (; i < n64; i+= 64) { | |||
| _mm512_storeu_ps(&y[i + 0], _mm512_loadu_ps(&y[i + 0]) + __alpha5 * _mm512_loadu_ps(&x[i + 0])); | |||
| _mm512_storeu_ps(&y[i + 16], _mm512_loadu_ps(&y[i + 16]) + __alpha5 * _mm512_loadu_ps(&x[i + 16])); | |||
| _mm512_storeu_ps(&y[i + 32], _mm512_loadu_ps(&y[i + 32]) + __alpha5 * _mm512_loadu_ps(&x[i + 32])); | |||
| _mm512_storeu_ps(&y[i + 48], _mm512_loadu_ps(&y[i + 48]) + __alpha5 * _mm512_loadu_ps(&x[i + 48])); | |||
| } | |||
| #endif | |||
| for (; i < n; i+= 32) { | |||
| _mm256_storeu_ps(&y[i + 0], _mm256_loadu_ps(&y[i + 0]) + __alpha * _mm256_loadu_ps(&x[i + 0])); | |||
| _mm256_storeu_ps(&y[i + 8], _mm256_loadu_ps(&y[i + 8]) + __alpha * _mm256_loadu_ps(&x[i + 8])); | |||
| _mm256_storeu_ps(&y[i + 16], _mm256_loadu_ps(&y[i + 16]) + __alpha * _mm256_loadu_ps(&x[i + 16])); | |||
| _mm256_storeu_ps(&y[i + 24], _mm256_loadu_ps(&y[i + 24]) + __alpha * _mm256_loadu_ps(&x[i + 24])); | |||
| } | |||
| } | |||
| #else | |||
| #include "saxpy_microk_haswell-2.c" | |||
| #endif | |||
| @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sdot_microk_steamroller-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sdot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "sdot_microk_haswell-2.c" | |||
| #elif defined (SKYLAKEX) | |||
| #include "sdot_microk_skylakex-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sdot_microk_sandy-2.c" | |||
| #endif | |||
| @@ -0,0 +1,98 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_16 1 | |||
| #include <immintrin.h> | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| int i = 0; | |||
| __m256 accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_ps(); | |||
| accum_1 = _mm256_setzero_ps(); | |||
| accum_2 = _mm256_setzero_ps(); | |||
| accum_3 = _mm256_setzero_ps(); | |||
| #ifdef __AVX512CD__ | |||
| __m512 accum_05, accum_15, accum_25, accum_35; | |||
| int n64; | |||
| n64 = n & (~63); | |||
| accum_05 = _mm512_setzero_ps(); | |||
| accum_15 = _mm512_setzero_ps(); | |||
| accum_25 = _mm512_setzero_ps(); | |||
| accum_35 = _mm512_setzero_ps(); | |||
| for (; i < n64; i += 64) { | |||
| accum_05 += _mm512_loadu_ps(&x[i+ 0]) * _mm512_loadu_ps(&y[i+ 0]); | |||
| accum_15 += _mm512_loadu_ps(&x[i+16]) * _mm512_loadu_ps(&y[i+16]); | |||
| accum_25 += _mm512_loadu_ps(&x[i+32]) * _mm512_loadu_ps(&y[i+32]); | |||
| accum_35 += _mm512_loadu_ps(&x[i+48]) * _mm512_loadu_ps(&y[i+48]); | |||
| } | |||
| /* | |||
| * we need to fold our 512 bit wide accumulator vectors into 256 bit wide vectors so that the AVX2 code | |||
| * below can continue using the intermediate results in its loop | |||
| */ | |||
| accum_0 = _mm512_extractf32x8_ps(accum_05, 0) + _mm512_extractf32x8_ps(accum_05, 1); | |||
| accum_1 = _mm512_extractf32x8_ps(accum_15, 0) + _mm512_extractf32x8_ps(accum_15, 1); | |||
| accum_2 = _mm512_extractf32x8_ps(accum_25, 0) + _mm512_extractf32x8_ps(accum_25, 1); | |||
| accum_3 = _mm512_extractf32x8_ps(accum_35, 0) + _mm512_extractf32x8_ps(accum_35, 1); | |||
| #endif | |||
| for (; i < n; i += 32) { | |||
| accum_0 += _mm256_loadu_ps(&x[i+ 0]) * _mm256_loadu_ps(&y[i+ 0]); | |||
| accum_1 += _mm256_loadu_ps(&x[i+ 8]) * _mm256_loadu_ps(&y[i+ 8]); | |||
| accum_2 += _mm256_loadu_ps(&x[i+16]) * _mm256_loadu_ps(&y[i+16]); | |||
| accum_3 += _mm256_loadu_ps(&x[i+24]) * _mm256_loadu_ps(&y[i+24]); | |||
| } | |||
| /* we now have the partial sums of the dot product in the 4 accumulation vectors, time to consolidate */ | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128 half_accum0; | |||
| /* Add upper half to lower half of each of the 256 bit vector to get a 128 bit vector */ | |||
| half_accum0 = _mm256_extractf128_ps(accum_0, 0) + _mm256_extractf128_ps(accum_0, 1); | |||
| /* in 128 bit land there is a hadd operation to do the rest of the element-wise sum in one go */ | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| *dot = half_accum0[0]; | |||
| } | |||
| #else | |||
| #include "sdot_microk_haswell-2.c" | |||
| #endif | |||
| @@ -280,8 +280,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -297,9 +297,9 @@ | |||
| * | |||
| * Determine the block size, the workspace size and the hous size. | |||
| * | |||
| IB = ILAENV( 18, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV( 19, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV( 20, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| * | |||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | |||
| INFO = -1 | |||
| @@ -285,8 +285,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -296,7 +296,7 @@ | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| LWMIN = ILAENV( 20, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||
| @@ -277,8 +277,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -294,9 +294,9 @@ | |||
| * | |||
| * Determine the block size, the workspace size and the hous size. | |||
| * | |||
| IB = ILAENV( 18, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV( 19, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV( 20, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| * | |||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | |||
| INFO = -1 | |||
| @@ -285,8 +285,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -296,7 +296,7 @@ | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| LWMIN = ILAENV( 20, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||
| @@ -277,8 +277,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -294,9 +294,9 @@ | |||
| * | |||
| * Determine the block size, the workspace size and the hous size. | |||
| * | |||
| IB = ILAENV( 18, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV( 19, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV( 20, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) | |||
| * | |||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | |||
| INFO = -1 | |||
| @@ -285,8 +285,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -296,7 +296,7 @@ | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| LWMIN = ILAENV( 20, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||
| @@ -280,8 +280,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -297,9 +297,9 @@ | |||
| * | |||
| * Determine the block size, the workspace size and the hous size. | |||
| * | |||
| IB = ILAENV( 18, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV( 19, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV( 20, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) | |||
| LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) | |||
| * | |||
| IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN | |||
| INFO = -1 | |||
| @@ -285,8 +285,8 @@ | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| INTEGER ILAENV2STAGE | |||
| EXTERNAL LSAME, ILAENV2STAGE | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -296,7 +296,7 @@ | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| LWMIN = ILAENV( 20, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) | |||
| LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||