| @@ -68,3 +68,4 @@ test/zblat2 | |||
| test/zblat3 | |||
| build | |||
| build.* | |||
| *.swp | |||
| @@ -24,7 +24,12 @@ before_install: | |||
| - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi | |||
| - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi | |||
| script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| # whitelist | |||
| branches: | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) | |||
| project(OpenBLAS) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 2) | |||
| set(OpenBLAS_PATCH_VERSION 14) | |||
| set(OpenBLAS_PATCH_VERSION 16) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| enable_language(ASM) | |||
| @@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH) | |||
| list(APPEND BLASDIRS kernel) | |||
| endif () | |||
| if (DEFINED UTEST_CHECK) | |||
| set(SANITY_CHECK 1) | |||
| endif () | |||
| if (DEFINED SANITY_CHECK) | |||
| list(APPEND BLASDIRS reference) | |||
| endif () | |||
| @@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED}) | |||
| message(FATAL_ERROR "Neither static nor shared are enabled.") | |||
| endif () | |||
| #Set default output directory | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) | |||
| # get obj vars into format that add_library likes: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) | |||
| set(TARGET_OBJS "") | |||
| foreach (SUBDIR ${SUBDIRS}) | |||
| @@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET | |||
| include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||
| endforeach() | |||
| enable_testing() | |||
| add_subdirectory(utest) | |||
| if(NOT MSVC) | |||
| #only build shared library for MSVC | |||
| @@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) | |||
| endif() | |||
| #build test and ctest | |||
| enable_testing() | |||
| add_subdirectory(test) | |||
| if(NOT NO_CBLAS) | |||
| add_subdirectory(ctest) | |||
| @@ -121,6 +121,17 @@ In chronological order: | |||
| * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). | |||
| ARMv8 support. | |||
| * Jerome Robert <jeromerobert@gmx.com> | |||
| * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478) | |||
| * [2015-12-23] `stack_check` in `gemv.c` (bug #722) | |||
| * [2015-12-28] Allow forcing the number of parallel make jobs | |||
| * [2015-12-28] Fix detection of AMD E2-3200 | |||
| * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected | |||
| * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731) | |||
| * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742) | |||
| * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760) | |||
| * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727) | |||
| * Dan Kortschak | |||
| * [2015-01-07] Added test for drotmg bug #484. | |||
| @@ -130,5 +141,11 @@ In chronological order: | |||
| * Martin Koehler <https://github.com/grisuthedragon/> | |||
| * [2015-09-07] Improved imatcopy | |||
| * Ashwin Sekhar T K <https://github.com/ashwinyes/> | |||
| * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8) | |||
| * [2015-11-20] lapack-test fixes for Cortex-A57 | |||
| * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 | |||
| * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 | |||
| * [Your name or handle] <[email or website]> | |||
| * [Date] [Brief summary of your changes] | |||
| @@ -1,4 +1,57 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.16 | |||
| 15-Mar-2016 | |||
| common: | |||
| * Avoid potential getenv segfault. (#716) | |||
| * Import LAPACK svn bugfix #142-#147,#150-#155 | |||
| x86/x86_64: | |||
| * Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller | |||
| * Fix bug with scipy linalg test. | |||
| ARM: | |||
| * Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) | |||
| POWER: | |||
| * Optimize D and Z BLAS3 functions for Power8. | |||
| ==================================================================== | |||
| Version 0.2.16.rc1 | |||
| 23-Feb-2016 | |||
| common: | |||
| * Upgrade LAPACK to 3.6.0 version. | |||
| Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build | |||
| LAPACK deprecated functions. | |||
| * Add MAKE_NB_JOBS option in Makefile. | |||
| Force number of make jobs. This is particularly | |||
| useful when using distcc. (#735. Thanks, Jerome Robert.) | |||
| * Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor). | |||
| * Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert) | |||
| * Improve small zger, zgemv, ztrmv using stack allocation (#727. Thanks, Jerome Robert) | |||
| * Let openblas_get_num_threads return the number of active threads. | |||
| (#760. Thanks, Jerome Robert) | |||
| * Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen) | |||
| * Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey) | |||
| * Update scipy benchmark script. (#745. Thanks, John Kirkham) | |||
| x86/x86_64: | |||
| * Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller. | |||
| * Detect Intel Avoton. | |||
| * Detect AMD Trinity, Richland, E2-3200. | |||
| * Fix gemv performance bug on Mac OSX Intel Haswell. | |||
| * Fix some bugs with CMake and Visual Studio | |||
| ARM: | |||
| * Support and optimize Cortex-A57 AArch64. | |||
| (#686. Thanks, Ashwin Sekhar TK) | |||
| * Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere) | |||
| * Update ARMV6 kernels. | |||
| POWER: | |||
| * Fix detection of POWER architecture | |||
| (#684. Thanks, Sebastien Villemot) | |||
| ==================================================================== | |||
| Version 0.2.15 | |||
| 27-Oct-2015 | |||
| @@ -7,10 +7,6 @@ ifneq ($(DYNAMIC_ARCH), 1) | |||
| BLASDIRS += kernel | |||
| endif | |||
| ifdef UTEST_CHECK | |||
| SANITY_CHECK = 1 | |||
| endif | |||
| ifdef SANITY_CHECK | |||
| BLASDIRS += reference | |||
| endif | |||
| @@ -85,22 +81,22 @@ endif | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| @$(MAKE) -C exports so | |||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @$(MAKE) -C exports dyn | |||
| @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @$(MAKE) -C exports dll | |||
| @@ -117,10 +113,8 @@ ifndef CROSS | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| ifdef UTEST_CHECK | |||
| $(MAKE) -C utest all | |||
| endif | |||
| endif | |||
| ifndef NO_CBLAS | |||
| $(MAKE) -C ctest all | |||
| endif | |||
| @@ -249,16 +243,23 @@ ifndef NOFORTRAN | |||
| -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifeq ($(FC), gfortran) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifdef SMP | |||
| ifeq ($(OSNAME), WINNT) | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| else | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| else | |||
| -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| ifeq ($(BUILD_LAPACK_DEPRECATED), 1) | |||
| -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| @@ -288,8 +289,18 @@ endif | |||
| lapack-test : | |||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||
| make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| ifneq ($(CROSS), 1) | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| endif | |||
| lapack-runtest: | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| blas-test: | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) | |||
| make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||
| @@ -11,8 +11,8 @@ endif | |||
| ifeq ($(CORE), ARMV7) | |||
| ifeq ($(OSNAME), Android) | |||
| CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a | |||
| FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a | |||
| CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch | |||
| FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch | |||
| else | |||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||
| FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||
| @@ -29,5 +29,3 @@ ifeq ($(CORE), ARMV5) | |||
| CCOMMON_OPT += -marm -march=armv5 | |||
| FCOMMON_OPT += -marm -march=armv5 | |||
| endif | |||
| @@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a | |||
| FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| ifeq ($(CORE), CORTEXA57) | |||
| CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| endif | |||
| @@ -29,7 +29,7 @@ install : lib.grd | |||
| #for inc | |||
| @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @@ -48,10 +48,10 @@ endif | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| endif | |||
| #for install static library | |||
| @@ -64,7 +64,7 @@ endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.2.15 | |||
| VERSION = 0.2.16 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -79,6 +79,9 @@ VERSION = 0.2.15 | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||
| # NO_LAPACKE = 1 | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| # BUILD_LAPACK_DEPRECATED = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -108,6 +111,10 @@ NO_AFFINITY = 1 | |||
| # Don't use parallel make. | |||
| # NO_PARALLEL_MAKE = 1 | |||
| # Force number of make jobs. The default is the number of logical CPUs of the host. | |||
| # This is particularly useful when using distcc | |||
| # MAKE_NB_JOBS = 2 | |||
| # If you would like to know minute performance report of GotoBLAS. | |||
| # FUNCTION_PROFILE = 1 | |||
| @@ -138,10 +145,6 @@ NO_AFFINITY = 1 | |||
| # slow (Not implemented yet). | |||
| # SANITY_CHECK = 1 | |||
| # Run testcases in utest/ . When you enable UTEST_CHECK, it would enable | |||
| # SANITY_CHECK to compare the result with reference BLAS. | |||
| # UTEST_CHECK = 1 | |||
| # The installation directory. | |||
| # PREFIX = /opt/OpenBLAS | |||
| @@ -159,10 +162,11 @@ COMMON_PROF = -pg | |||
| # Build Debug version | |||
| # DEBUG = 1 | |||
| # Improve GEMV and GER for small matrices by stack allocation. | |||
| # For details, https://github.com/xianyi/OpenBLAS/pull/482 | |||
| # Set maximum stack allocation. | |||
| # The default value is 2048. 0 disables stack allocation and may reduce GER and GEMV | |||
| # performance. For details, https://github.com/xianyi/OpenBLAS/pull/482 | |||
| # | |||
| MAX_STACK_ALLOC=2048 | |||
| # MAX_STACK_ALLOC = 0 | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoid conflicts with other BLAS libraries, especially when using | |||
| @@ -139,6 +139,10 @@ NO_PARALLEL_MAKE=0 | |||
| endif | |||
| GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) | |||
| ifdef MAKE_NB_JOBS | |||
| GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) | |||
| endif | |||
| ifeq ($(HOSTCC), loongcc) | |||
| GETARCH_FLAGS += -static | |||
| endif | |||
| @@ -292,12 +296,14 @@ endif | |||
| ifneq ($(OSNAME), WINNT) | |||
| ifneq ($(OSNAME), CYGWIN_NT) | |||
| ifneq ($(OSNAME), Interix) | |||
| ifneq ($(OSNAME), Android) | |||
| ifdef SMP | |||
| EXTRALIB += -lpthread | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| # ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) | |||
| @@ -324,7 +330,8 @@ ifdef SANITY_CHECK | |||
| CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) | |||
| endif | |||
| ifdef MAX_STACK_ALLOC | |||
| MAX_STACK_ALLOC ?= 2048 | |||
| ifneq ($(MAX_STACK_ALLOC), 0) | |||
| CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) | |||
| endif | |||
| @@ -374,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| EXPRECISION = 1 | |||
| CCOMMON_OPT += -DEXPRECISION | |||
| CCOMMON_OPT += -DEXPRECISION | |||
| FCOMMON_OPT += -m128bit-long-double | |||
| endif | |||
| endif | |||
| @@ -388,7 +395,7 @@ endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| #check | |||
| #check | |||
| ifeq ($(USE_THREAD), 0) | |||
| $(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) | |||
| endif | |||
| @@ -952,17 +959,18 @@ ifeq ($(OSNAME), SunOS) | |||
| TAR = gtar | |||
| PATCH = gpatch | |||
| GREP = ggrep | |||
| AWK = nawk | |||
| else | |||
| TAR = tar | |||
| PATCH = patch | |||
| GREP = grep | |||
| AWK = awk | |||
| endif | |||
| ifndef MD5SUM | |||
| MD5SUM = md5sum | |||
| endif | |||
| AWK = awk | |||
| REVISION = -r$(VERSION) | |||
| MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) | |||
| @@ -971,16 +979,25 @@ ifeq ($(DEBUG), 1) | |||
| COMMON_OPT += -g | |||
| endif | |||
| ifeq ($(DEBUG), 1) | |||
| FCOMMON_OPT += -g | |||
| endif | |||
| ifndef COMMON_OPT | |||
| COMMON_OPT = -O2 | |||
| endif | |||
| ifndef FCOMMON_OPT | |||
| FCOMMON_OPT = -O2 -frecursive | |||
| endif | |||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||
| override FFLAGS += $(FCOMMON_OPT) | |||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| #For LAPACK Fortran codes. | |||
| @@ -1170,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1 | |||
| else | |||
| SUNPATH = /opt/SUNWspro | |||
| endif | |||
| @@ -75,10 +75,11 @@ Please read GotoBLAS_01Readme.txt | |||
| #### ARM64: | |||
| - **ARMV8**: Experimental | |||
| - **ARM Cortex-A57**: Experimental | |||
| ### Support OS: | |||
| - **GNU/Linux** | |||
| - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | |||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | |||
| @@ -74,3 +74,5 @@ ARMV5 | |||
| 7.ARM 64-bit CPU: | |||
| ARMV8 | |||
| CORTEXA57 | |||
| @@ -0,0 +1,199 @@ | |||
| # Notes on OpenBLAS usage | |||
| ## Usage | |||
| #### Program is Terminated. Because you tried to allocate too many memory regions | |||
| In OpenBLAS, we manage a pool of memory buffers and allocate the number of | |||
| buffers as follows. | |||
| ``` | |||
| #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) | |||
| ``` | |||
| This error indicates that the program exceeded the number of buffers. | |||
| Please build OpenBLAS with larger `NUM_THREADS`. For example, `make | |||
| NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set | |||
| `MAX_CPU_NUMBER=NUM_THREADS`. | |||
| #### How can I use OpenBLAS in multi-threaded applications? | |||
| If your application is already multi-threaded, it will conflict with OpenBLAS | |||
| multi-threading. Thus, you must set OpenBLAS to use single thread in any of the | |||
| following ways: | |||
| * `export OPENBLAS_NUM_THREADS=1` in the environment variables. | |||
| * Call `openblas_set_num_threads(1)` in the application at runtime. | |||
| * Build OpenBLAS single thread version, e.g. `make USE_THREAD=0` | |||
| If the application is parallelized by OpenMP, please use OpenBLAS built with | |||
| `USE_OPENMP=1` | |||
| #### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH | |||
| The environment variable that controls the kernel selection is | |||
| `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export | |||
| OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()` | |||
| returns the used target. | |||
| #### How could I disable OpenBLAS threading affinity on runtime? | |||
| You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment | |||
| variable to disable threading affinity on runtime. For example, before the | |||
| running, | |||
| ``` | |||
| export OPENBLAS_MAIN_FREE=1 | |||
| ``` | |||
| Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1` | |||
| in `Makefile.rule`. | |||
| ## Linking with the library | |||
| * Link with shared library | |||
| `gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas` | |||
| If the library is multithreaded, please add `-lpthread`. If the library | |||
| contains LAPACK functions, please add `-lgfortran` or other Fortran libs. | |||
| * Link with static library | |||
| `gcc -o test test.c /your/path/libopenblas.a` | |||
| You can download `test.c` from https://gist.github.com/xianyi/5780018 | |||
| On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by | |||
| default), custom programs statically linked against `libopenblas.a` should also | |||
| link with the pthread library e.g.: | |||
| ``` | |||
| gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread | |||
| ``` | |||
| Failing to add the `-lpthread` flag will cause errors such as: | |||
| ``` | |||
| /opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': | |||
| memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' | |||
| memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' | |||
| ... | |||
| ``` | |||
| ## Code examples | |||
| #### Call CBLAS interface | |||
| This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656 | |||
| ``` | |||
| #include <cblas.h> | |||
| #include <stdio.h> | |||
| void main() | |||
| { | |||
| int i=0; | |||
| double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; | |||
| double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; | |||
| double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; | |||
| cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); | |||
| for(i=0; i<9; i++) | |||
| printf("%lf ", C[i]); | |||
| printf("\n"); | |||
| } | |||
| ``` | |||
| `gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran` | |||
| #### Call BLAS Fortran interface | |||
| This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018 | |||
| ``` | |||
| #include "stdio.h" | |||
| #include "stdlib.h" | |||
| #include "sys/time.h" | |||
| #include "time.h" | |||
| extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); | |||
/*
 * Times one call to the Fortran-interface dgemm_ on an m x k by k x n product
 * and appends the result to timeDGEMM.txt.
 *
 * Usage: ./time_dgemm <m> <n> <k>
 *
 * Returns 0 on success and 1 when the arguments are missing or not positive,
 * when a matrix cannot be allocated, or when the result file cannot be opened.
 */
int main(int argc, char* argv[])
{
  int i;
  printf("test!\n");
  if(argc<4){
    printf("Input Error\n");
    return 1;
  }

  int m = atoi(argv[1]);
  int n = atoi(argv[2]);
  int k = atoi(argv[3]);
  /* atoi yields 0 for non-numeric text; the sizes below feed malloc and
     dgemm_, so reject anything that is not a positive dimension. */
  if(m<=0 || n<=0 || k<=0){
    printf("Input Error\n");
    return 1;
  }

  int sizeofa = m * k;
  int sizeofb = k * n;
  int sizeofc = m * n;
  char ta = 'N';
  char tb = 'N';
  double alpha = 1.2;
  double beta = 0.001;

  struct timeval start,finish;
  double duration;

  double* A = (double*)malloc(sizeof(double) * sizeofa);
  double* B = (double*)malloc(sizeof(double) * sizeofb);
  double* C = (double*)malloc(sizeof(double) * sizeofc);
  if(A==NULL || B==NULL || C==NULL){
    /* free(NULL) is a no-op, so releasing all three is safe. */
    fprintf(stderr, "Out of memory\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  srand((unsigned)time(NULL));

  for (i=0; i<sizeofa; i++)
    A[i] = i%3+1;//(rand()%100)/10.0;

  for (i=0; i<sizeofb; i++)
    B[i] = i%3+1;//(rand()%100)/10.0;

  for (i=0; i<sizeofc; i++)
    C[i] = i%3+1;//(rand()%100)/10.0;

  printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
  gettimeofday(&start, NULL);
  dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
  gettimeofday(&finish, NULL);

  duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
  /* 2*m*n*k floating-point operations, scaled by 1e-6: the value is MFLOPS,
     matching the unit written to the file. */
  double mflops = 2.0 * m *n*k;
  mflops = mflops/duration*1.0e-6;

  int status = 0;
  FILE *fp;
  fp = fopen("timeDGEMM.txt", "a");
  if(fp==NULL){
    perror("timeDGEMM.txt");
    status = 1;
  }else{
    fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, mflops);
    fclose(fp);
  }

  free(A);
  free(B);
  free(C);
  return status;
}
| ``` | |||
| ` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a` | |||
| ` ./time_dgemm <m> <n> <k> ` | |||
| ## Troubleshooting | |||
| * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. | |||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. | |||
| * The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. | |||
| * OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A, `make test` may fail with a pthread_create error (error code EAGAIN). However, the same test case runs fine when launched directly from the shell. | |||
| ## BLAS reference manual | |||
| If you want to understand every BLAS function and definition, please read | |||
| [Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm) | |||
| or [netlib.org](http://netlib.org/blas/) | |||
| Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions) | |||
| ## How to reference OpenBLAS. | |||
| You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications). | |||
| Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly. | |||
| @@ -39,4 +39,6 @@ before_build: | |||
| - cmake -G "Visual Studio 12 Win64" . | |||
| test_script: | |||
| - echo Build OK! | |||
| - echo Running Test | |||
| - cd c:\projects\OpenBLAS\utest | |||
| - openblas_utest | |||
| @@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| smallscaling | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c | |||
| zgemm3m.$(SUFFIX) : gemm3m.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib | |||
| @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ | |||
| srandom(getpid()); | |||
| #endif | |||
| for(j = 0; j < m; j++){ | |||
| for(j = 0; j < to; j++){ | |||
| for(i = 0; i < to * COMPSIZE; i++){ | |||
| a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -0,0 +1,58 @@ | |||
| #!/usr/bin/env python | |||
| import os | |||
| import sys | |||
| import time | |||
| import numpy | |||
| from numpy import zeros | |||
| from numpy.random import randn | |||
| from scipy.linalg import blas | |||
def run_dsyrk(N, l):
    """Time ``l`` consecutive DSYRK calls on an N x N matrix and print the rate.

    Parameters
    ----------
    N : int
        Order of the square float64 input ``A`` and of the result ``C``.
    l : int
        Number of ``blas.dsyrk`` calls made inside the timed region.

    Prints one line holding the size, the rate in MFlops (counting N**3
    operations per call) and the elapsed wall-clock time in seconds.
    Returns None.
    """
    A = randn(N, N).astype('float64', order='F')
    C = zeros((N, N), dtype='float64', order='F')

    start = time.time()
    for _ in range(l):
        blas.dsyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()

    timediff = end - start
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        # Small problems can finish within one tick of a coarse timer, giving
        # an elapsed time of 0.  Report "not measurable" rather than dying
        # with ZeroDivisionError in the middle of a size sweep.
        mflops = float('nan')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
    # Sweep defaults: first size, last size, size increment, calls per size.
    N = 128
    NMAX = 2048
    NINC = 128
    LOOPS = 1

    # Optional positional overrides, in order: N NMAX NINC LOOPS.
    # Index 0 is the script name; arguments past the fourth are ignored.
    for position, value in enumerate(sys.argv):
        if position == 1:
            N = int(value)
        elif position == 2:
            NMAX = int(value)
        elif position == 3:
            NINC = int(value)
        elif position == 4:
            LOOPS = int(value)

    # A non-empty OPENBLAS_LOOPS environment variable takes precedence over
    # the command-line loop count.
    loops_env = os.environ.get('OPENBLAS_LOOPS')
    if loops_env:
        LOOPS = int(loops_env)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for order in range(N, NMAX + NINC, NINC):
        run_dsyrk(order, LOOPS)
| @@ -0,0 +1,58 @@ | |||
| #!/usr/bin/env python | |||
| import os | |||
| import sys | |||
| import time | |||
| import numpy | |||
| from numpy import zeros | |||
| from numpy.random import randn | |||
| from scipy.linalg import blas | |||
def run_ssyrk(N, l):
    """Time ``l`` consecutive SSYRK calls on an N x N matrix and print the rate.

    Parameters
    ----------
    N : int
        Order of the square float32 input ``A`` and of the result ``C``.
    l : int
        Number of ``blas.ssyrk`` calls made inside the timed region.

    Prints one line holding the size, the rate in MFlops (counting N**3
    operations per call) and the elapsed wall-clock time in seconds.
    Returns None.
    """
    A = randn(N, N).astype('float32', order='F')
    C = zeros((N, N), dtype='float32', order='F')

    start = time.time()
    for _ in range(l):
        blas.ssyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()

    timediff = end - start
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        # Small problems can finish within one tick of a coarse timer, giving
        # an elapsed time of 0.  Report "not measurable" rather than dying
        # with ZeroDivisionError in the middle of a size sweep.
        mflops = float('nan')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
    # Sweep defaults: first size, last size, size increment, calls per size.
    N = 128
    NMAX = 2048
    NINC = 128
    LOOPS = 1

    # Optional positional overrides, in order: N NMAX NINC LOOPS.
    # Index 0 is the script name; arguments past the fourth are ignored.
    for position, value in enumerate(sys.argv):
        if position == 1:
            N = int(value)
        elif position == 2:
            NMAX = int(value)
        elif position == 3:
            NINC = int(value)
        elif position == 4:
            LOOPS = int(value)

    # A non-empty OPENBLAS_LOOPS environment variable takes precedence over
    # the command-line loop count.
    loops_env = os.environ.get('OPENBLAS_LOOPS')
    if loops_env:
        LOOPS = int(loops_env)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for order in range(N, NMAX + NINC, NINC):
        run_ssyrk(order, LOOPS)
| @@ -0,0 +1,196 @@ | |||
| // run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n | |||
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#ifndef _WIN32
#include <pthread.h>
#endif
#include <cblas.h>
#include <omp.h>
| #define MIN_SIZE 5 | |||
| #define MAX_SIZE 60 | |||
| #define NB_SIZE 10 | |||
| // Number of loops for a 1x1 matrix. Lower it if the test is | |||
| // too slow on your computer. | |||
| #define NLOOP 2e7 | |||
/*
 * Parameters of one benchmark run.
 * bench_func and blas_func are declared without a prototype because they are
 * invoked with different argument lists by the different bench drivers.
 */
typedef struct {
    int     matrix_size;               /* order of the square matrix */
    int     n_loop;                    /* loop budget; drivers divide it by matrix_size */
    void  (*bench_func)();             /* benchmark driver, called with a BenchParam * */
    void  (*blas_func)();              /* BLAS routine invoked by the driver */
    void *(*create_matrix)(int size);  /* allocates and fills an operand buffer */
} BenchParam;
/*
 * Allocate a buffer of `size` floats and set element i to 1e3 * i / size.
 * The caller owns the returned buffer and releases it with free().
 * Terminates the program if the allocation fails.
 */
void * s_create_matrix(int size) {
    /* Size the buffer by the element type actually stored; the previous
     * sizeof(double) requested twice the memory that is used. */
    float * r = malloc(size * sizeof *r);
    int i;

    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * is treated as an allocation failure. */
    if (r == NULL && size > 0) {
        fprintf(stderr, "s_create_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
/*
 * Allocate a buffer of 2 * size floats (two per matrix element) and set
 * entry i to 1e3 * i / size.
 * The caller owns the returned buffer and releases it with free().
 * Terminates the program if the allocation fails.
 */
void * c_create_matrix(int size) {
    /* Size the buffer by the element type actually stored; the previous
     * sizeof(double) requested twice the memory that is used. */
    float * r = malloc(size * 2 * sizeof *r);
    int i;

    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * is treated as an allocation failure. */
    if (r == NULL && size > 0) {
        fprintf(stderr, "c_create_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
/*
 * Allocate a buffer of 2 * size doubles (two per matrix element) and set
 * entry i to 1e3 * i / size.
 * The caller owns the returned buffer and releases it with free().
 * Terminates the program if the allocation fails.
 */
void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof *r);
    int i;

    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * is treated as an allocation failure. */
    if (r == NULL && size > 0) {
        fprintf(stderr, "z_create_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
/*
 * Allocate a buffer of `size` doubles and set element i to 1e3 * i / size.
 * The caller owns the returned buffer and releases it with free().
 * Terminates the program if the allocation fails.
 */
void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof *r);
    int i;

    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * is treated as an allocation failure. */
    if (r == NULL && size > 0) {
        fprintf(stderr, "d_create_matrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
| void trmv_bench(BenchParam * param) | |||
| { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func("U", "N", "N", &size, A, &size, y, &one); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| void gemv_bench(BenchParam * param) | |||
| { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| double v = 1.01; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| void ger_bench(BenchParam * param) { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| double v = 1.01; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| #ifndef _WIN32 | |||
| void * pthread_func_wrapper(void * param) { | |||
| ((BenchParam *)param)->bench_func(param); | |||
| pthread_exit(NULL); | |||
| } | |||
| #endif | |||
| #define NB_TESTS 5 | |||
| void * TESTS[4 * NB_TESTS] = { | |||
| trmv_bench, ztrmv_, z_create_matrix, "ztrmv", | |||
| gemv_bench, dgemv_, d_create_matrix, "dgemv", | |||
| gemv_bench, zgemv_, z_create_matrix, "zgemv", | |||
| ger_bench, dger_, d_create_matrix, "dger", | |||
| ger_bench, zgerc_, z_create_matrix, "zgerc", | |||
| }; | |||
| inline static double delta_time(struct timespec tick) { | |||
| struct timespec tock; | |||
| clock_gettime(CLOCK_MONOTONIC, &tock); | |||
| return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9; | |||
| } | |||
| double pthread_bench(BenchParam * param, int nb_threads) | |||
| { | |||
| #ifdef _WIN32 | |||
| return 0; | |||
| #else | |||
| BenchParam threaded_param = *param; | |||
| pthread_t threads[nb_threads]; | |||
| int t, rc; | |||
| struct timespec tick; | |||
| threaded_param.n_loop /= nb_threads; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| for(t=0; t<nb_threads; t++){ | |||
| rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param); | |||
| if (rc){ | |||
| printf("ERROR; return code from pthread_create() is %d\n", rc); | |||
| exit(-1); | |||
| } | |||
| } | |||
| for(t=0; t<nb_threads; t++){ | |||
| pthread_join(threads[t], NULL); | |||
| } | |||
| return delta_time(tick); | |||
| #endif | |||
| } | |||
| double seq_bench(BenchParam * param) { | |||
| struct timespec tick; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| param->bench_func(param); | |||
| return delta_time(tick); | |||
| } | |||
| double omp_bench(BenchParam * param) { | |||
| BenchParam threaded_param = *param; | |||
| struct timespec tick; | |||
| int t; | |||
| int nb_threads = omp_get_max_threads(); | |||
| threaded_param.n_loop /= nb_threads; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| #pragma omp parallel for | |||
| for(t = 0; t < nb_threads; t ++){ | |||
| param->bench_func(&threaded_param); | |||
| } | |||
| return delta_time(tick); | |||
| } | |||
| int main(int argc, char * argv[]) { | |||
| double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE); | |||
| BenchParam param; | |||
| int test_id; | |||
| printf ("Running on %d threads\n", omp_get_max_threads()); | |||
| for(test_id = 0; test_id < NB_TESTS; test_id ++) { | |||
| double size = MIN_SIZE; | |||
| param.bench_func = TESTS[test_id * 4]; | |||
| param.blas_func = TESTS[test_id * 4 + 1]; | |||
| param.create_matrix = TESTS[test_id * 4 + 2]; | |||
| printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]); | |||
| param.n_loop = NLOOP; | |||
| while(size <= MAX_SIZE) { | |||
| param.matrix_size = (int)(size + 0.5); | |||
| double seq_time = seq_bench(¶m); | |||
| double omp_time = omp_bench(¶m); | |||
| double pthread_time = pthread_bench(¶m, omp_get_max_threads()); | |||
| printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, " | |||
| "pthread %gs, speedup %g\n", | |||
| param.matrix_size, seq_time, | |||
| omp_time, seq_time / omp_time, | |||
| pthread_time, seq_time / pthread_time); | |||
| size *= inc_factor; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -6,6 +6,7 @@ $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| $hostarch = "arm" if ($hostarch =~ /^arm.*/); | |||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
| $binary = $ENV{"BINARY"}; | |||
| @@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") | |||
| if (NOT NO_EXPRECISION) | |||
| if (${F_COMPILER} MATCHES "GFORTRAN") | |||
| # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa | |||
| if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") | |||
| set(EXPRECISION 1) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "Clang") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") | |||
| set(EXPRECISION 1) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") | |||
| @@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") | |||
| endif () | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "Intel") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "Clang") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") | |||
| message(WARNING "Clang doesn't support OpenMP yet.") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "Intel") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "PGI") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mp") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mp") | |||
| set(CEXTRALIB "${CEXTRALIB} -lstdc++") | |||
| endif () | |||
| if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mp") | |||
| endif () | |||
| endif () | |||
| @@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64") | |||
| set(BINARY_DEFINED 1) | |||
| if (${F_COMPILER} MATCHES "GFORTRAN") | |||
| if (${CMAKE_C_COMPILER} STREQUAL "GNU") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| # EXPRECISION = 1 | |||
| # CCOMMON_OPT += -DEXPRECISION | |||
| endif () | |||
| @@ -48,18 +48,18 @@ set(SLASRC | |||
| sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f | |||
| sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f | |||
| sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f | |||
| sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f | |||
| sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f | |||
| sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f | |||
| DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f | |||
| sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f | |||
| sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f | |||
| sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f | |||
| sgetc2.f sgetri.f | |||
| sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f | |||
| sggglm.f sgghrd.f sgglse.f sggqrf.f | |||
| sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f | |||
| sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f | |||
| sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f | |||
| shsein.f shseqr.f slabrd.f slacon.f slacn2.f | |||
| slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f | |||
| slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f | |||
| DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f | |||
| slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f | |||
| slansy.f slantb.f slantp.f slantr.f slanv2.f | |||
| slapll.f slapmt.f | |||
| @@ -69,7 +69,7 @@ set(SLASRC | |||
| slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f | |||
| slarrv.f slartv.f | |||
| slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f | |||
| slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f | |||
| slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f | |||
| sopgtr.f sopmtr.f sorg2l.f sorg2r.f | |||
| sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f | |||
| sorgrq.f sorgtr.f sorm2l.f sorm2r.f | |||
| @@ -97,7 +97,7 @@ set(SLASRC | |||
| stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f | |||
| stptrs.f | |||
| strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f | |||
| strtrs.f stzrqf.f stzrzf.f sstemr.f | |||
| strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f | |||
| slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f | |||
| stfttr.f stpttf.f stpttr.f strttf.f strttp.f | |||
| sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f | |||
| @@ -114,14 +114,14 @@ set(CLASRC | |||
| cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f | |||
| cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f | |||
| cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f | |||
| cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f | |||
| cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f | |||
| cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f | |||
| DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f | |||
| cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f | |||
| DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f | |||
| cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f | |||
| cgesvx.f cgetc2.f cgetri.f | |||
| cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f | |||
| cgghrd.f cgglse.f cggqrf.f cggrqf.f | |||
| cggsvd.f cggsvp.f | |||
| DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f | |||
| cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f | |||
| chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f | |||
| checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f | |||
| @@ -138,7 +138,7 @@ set(CLASRC | |||
| claed0.f claed7.f claed8.f | |||
| claein.f claesy.f claev2.f clags2.f clagtm.f | |||
| clahef.f clahef_rook.f clahqr.f | |||
| clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f | |||
| DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f | |||
| clanhb.f clanhe.f | |||
| clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f | |||
| clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f | |||
| @@ -149,7 +149,7 @@ set(CLASRC | |||
| clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f | |||
| clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f | |||
| clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f | |||
| clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f | |||
| DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f | |||
| cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f | |||
| cposv.f cposvx.f cpstrf.f cpstf2.f | |||
| cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f | |||
| @@ -166,7 +166,7 @@ set(CLASRC | |||
| ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f | |||
| ctprfs.f ctptri.f | |||
| ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f | |||
| ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f | |||
| ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f | |||
| cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f | |||
| cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f | |||
| cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f | |||
| @@ -186,18 +186,18 @@ set(DLASRC | |||
| dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f | |||
| dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f | |||
| dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f | |||
| dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f | |||
| dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f | |||
| dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f | |||
| DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f | |||
| dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f | |||
| dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f | |||
| dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f | |||
| dgetc2.f dgetri.f | |||
| dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f | |||
| dggglm.f dgghrd.f dgglse.f dggqrf.f | |||
| dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f | |||
| dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f | |||
| dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f | |||
| dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f | |||
| dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f | |||
| dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f | |||
| DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f | |||
| dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f | |||
| dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f | |||
| dlapll.f dlapmt.f | |||
| @@ -207,7 +207,7 @@ set(DLASRC | |||
| dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f | |||
| dlargv.f dlarrv.f dlartv.f | |||
| dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f | |||
| dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f | |||
| dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f | |||
| dopgtr.f dopmtr.f dorg2l.f dorg2r.f | |||
| dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f | |||
| dorgrq.f dorgtr.f dorm2l.f dorm2r.f | |||
| @@ -235,7 +235,7 @@ set(DLASRC | |||
| dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f | |||
| dtptrs.f | |||
| dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f | |||
| dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f | |||
| dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f | |||
| dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f | |||
| dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f | |||
| dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f | |||
| @@ -251,14 +251,14 @@ set(ZLASRC | |||
| zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f | |||
| zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f | |||
| zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f | |||
| zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f | |||
| zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f | |||
| zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f | |||
| DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f | |||
| zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f | |||
| DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f | |||
| zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f | |||
| zgetri.f | |||
| zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f | |||
| zgghrd.f zgglse.f zggqrf.f zggrqf.f | |||
| zggsvd.f zggsvp.f | |||
| DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f | |||
| zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f | |||
| zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f | |||
| zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f | |||
| @@ -275,7 +275,7 @@ set(ZLASRC | |||
| zlaed0.f zlaed7.f zlaed8.f | |||
| zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f | |||
| zlahef.f zlahef_rook.f zlahqr.f | |||
| zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f | |||
| DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f | |||
| zlangt.f zlanhb.f | |||
| zlanhe.f | |||
| zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f | |||
| @@ -288,7 +288,7 @@ set(ZLASRC | |||
| zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f | |||
| zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f | |||
| zlassq.f zlasyf.f zlasyf_rook.f | |||
| zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f | |||
| zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f | |||
| zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f | |||
| zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f | |||
| zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f | |||
| @@ -306,7 +306,7 @@ set(ZLASRC | |||
| ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f | |||
| ztprfs.f ztptri.f | |||
| ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f | |||
| ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f | |||
| ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f | |||
| zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f | |||
| zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f | |||
| zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f | |||
| @@ -2038,6 +2038,59 @@ set(MATGEN | |||
| lapacke_zlagsy_work.c | |||
| ) | |||
# Bare file names of the LAPACKE utility sources. The directory prefix is
# added where the list is consumed.
set(Utils_SRC
  lapacke_cgb_nancheck.c  lapacke_dpf_nancheck.c         lapacke_ssy_trans.c
  lapacke_cgb_trans.c     lapacke_dpf_trans.c            lapacke_stb_nancheck.c
  lapacke_cge_nancheck.c  lapacke_dpo_nancheck.c         lapacke_stb_trans.c
  lapacke_cge_trans.c     lapacke_dpo_trans.c            lapacke_stf_nancheck.c
  lapacke_cgg_nancheck.c  lapacke_dpp_nancheck.c         lapacke_stf_trans.c
  lapacke_cgg_trans.c     lapacke_dpp_trans.c            lapacke_stp_nancheck.c
  lapacke_cgt_nancheck.c  lapacke_dpt_nancheck.c         lapacke_stp_trans.c
  lapacke_chb_nancheck.c  lapacke_dsb_nancheck.c         lapacke_str_nancheck.c
  lapacke_chb_trans.c     lapacke_dsb_trans.c            lapacke_str_trans.c
  lapacke_che_nancheck.c  lapacke_dsp_nancheck.c         lapacke_xerbla.c
  lapacke_che_trans.c     lapacke_dsp_trans.c            lapacke_zgb_nancheck.c
  lapacke_chp_nancheck.c  lapacke_dst_nancheck.c         lapacke_zgb_trans.c
  lapacke_chp_trans.c     lapacke_dsy_nancheck.c         lapacke_zge_nancheck.c
  lapacke_chs_nancheck.c  lapacke_dsy_trans.c            lapacke_zge_trans.c
  lapacke_chs_trans.c     lapacke_dtb_nancheck.c         lapacke_zgg_nancheck.c
  lapacke_c_nancheck.c    lapacke_dtb_trans.c            lapacke_zgg_trans.c
  lapacke_cpb_nancheck.c  lapacke_dtf_nancheck.c         lapacke_zgt_nancheck.c
  lapacke_cpb_trans.c     lapacke_dtf_trans.c            lapacke_zhb_nancheck.c
  lapacke_cpf_nancheck.c  lapacke_dtp_nancheck.c         lapacke_zhb_trans.c
  lapacke_cpf_trans.c     lapacke_dtp_trans.c            lapacke_zhe_nancheck.c
  lapacke_cpo_nancheck.c  lapacke_dtr_nancheck.c         lapacke_zhe_trans.c
  lapacke_cpo_trans.c     lapacke_dtr_trans.c            lapacke_zhp_nancheck.c
  lapacke_cpp_nancheck.c  lapacke_lsame.c                lapacke_zhp_trans.c
  lapacke_cpp_trans.c     lapacke_make_complex_double.c  lapacke_zhs_nancheck.c
  lapacke_cpt_nancheck.c  lapacke_make_complex_float.c   lapacke_zhs_trans.c
  lapacke_csp_nancheck.c  lapacke_sgb_nancheck.c         lapacke_z_nancheck.c
  lapacke_csp_trans.c     lapacke_sgb_trans.c            lapacke_zpb_nancheck.c
  lapacke_cst_nancheck.c  lapacke_sge_nancheck.c         lapacke_zpb_trans.c
  lapacke_csy_nancheck.c  lapacke_sge_trans.c            lapacke_zpf_nancheck.c
  lapacke_csy_trans.c     lapacke_sgg_nancheck.c         lapacke_zpf_trans.c
  lapacke_ctb_nancheck.c  lapacke_sgg_trans.c            lapacke_zpo_nancheck.c
  lapacke_ctb_trans.c     lapacke_sgt_nancheck.c         lapacke_zpo_trans.c
  lapacke_ctf_nancheck.c  lapacke_shs_nancheck.c         lapacke_zpp_nancheck.c
  lapacke_ctf_trans.c     lapacke_shs_trans.c            lapacke_zpp_trans.c
  lapacke_ctp_nancheck.c  lapacke_s_nancheck.c           lapacke_zpt_nancheck.c
  lapacke_ctp_trans.c     lapacke_spb_nancheck.c         lapacke_zsp_nancheck.c
  lapacke_ctr_nancheck.c  lapacke_spb_trans.c            lapacke_zsp_trans.c
  lapacke_ctr_trans.c     lapacke_spf_nancheck.c         lapacke_zst_nancheck.c
  lapacke_dgb_nancheck.c  lapacke_spf_trans.c            lapacke_zsy_nancheck.c
  lapacke_dgb_trans.c     lapacke_spo_nancheck.c         lapacke_zsy_trans.c
  lapacke_dge_nancheck.c  lapacke_spo_trans.c            lapacke_ztb_nancheck.c
  lapacke_dge_trans.c     lapacke_spp_nancheck.c         lapacke_ztb_trans.c
  lapacke_dgg_nancheck.c  lapacke_spp_trans.c            lapacke_ztf_nancheck.c
  lapacke_dgg_trans.c     lapacke_spt_nancheck.c         lapacke_ztf_trans.c
  lapacke_dgt_nancheck.c  lapacke_ssb_nancheck.c         lapacke_ztp_nancheck.c
  lapacke_dhs_nancheck.c  lapacke_ssb_trans.c            lapacke_ztp_trans.c
  lapacke_dhs_trans.c     lapacke_ssp_nancheck.c         lapacke_ztr_nancheck.c
  lapacke_d_nancheck.c    lapacke_ssp_trans.c            lapacke_ztr_trans.c
  lapacke_dpb_nancheck.c  lapacke_sst_nancheck.c
  lapacke_dpb_trans.c     lapacke_ssy_nancheck.c
)
| set(LAPACKE_REL_SRC "") | |||
| if (BUILD_SINGLE) | |||
| list(APPEND LAPACKE_REL_SRC ${SSRC}) | |||
| @@ -2058,10 +2111,14 @@ endif () | |||
| # add lapack-netlib folder to the sources | |||
| set(LAPACKE_SOURCES "") | |||
| foreach (LAE_FILE ${LAPACKE_REL_SRC}) | |||
| list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}") | |||
| list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}") | |||
| endforeach () | |||
| foreach (Utils_FILE ${Utils_SRC}) | |||
| list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}") | |||
| endforeach () | |||
| set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include") | |||
| set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") | |||
| execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h") | |||
| include_directories(${lapacke_include_dir}) | |||
| set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||
| @@ -86,13 +86,14 @@ extern "C" { | |||
| #if !defined(_MSC_VER) | |||
| #include <unistd.h> | |||
| #endif | |||
| #include <time.h> | |||
| #ifdef OS_LINUX | |||
| #include <malloc.h> | |||
| #include <sched.h> | |||
| #endif | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) | |||
| #include <sched.h> | |||
| #endif | |||
| @@ -331,12 +332,13 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| */ | |||
| /* | |||
| #ifdef STEAMROLLER | |||
| @@ -410,7 +412,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWS | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p)) | |||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
| #else | |||
| typedef char* env_var_t; | |||
| #define readenv(p, n) ((p)=getenv(n)) | |||
| @@ -726,6 +728,7 @@ typedef struct { | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #include "common_stackalloc.h" | |||
| #if 0 | |||
| #include "symcopy.h" | |||
| #endif | |||
| @@ -43,28 +43,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef ASSEMBLER | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| long register ret; | |||
| BLASULONG ret; | |||
| do { | |||
| while (*address) {YIELDING;}; | |||
| __asm__ __volatile__( | |||
| "ldaxr %0, [%1] \n\t" | |||
| "stlxr w2, %2, [%1] \n\t" | |||
| "orr %0, %0, x2 \n\t" | |||
| : "=r"(ret) | |||
| : "r"(address), "r"(1l) | |||
| : "memory", "x2" | |||
| "mov x4, #1 \n\t" | |||
| "1: \n\t" | |||
| "ldaxr x2, [%1] \n\t" | |||
| "cbnz x2, 1b \n\t" | |||
| "2: \n\t" | |||
| "stxr w3, x4, [%1] \n\t" | |||
| "cbnz w3, 1b \n\t" | |||
| "mov %0, #0 \n\t" | |||
| : "=r"(ret), "=r"(address) | |||
| : "1"(address) | |||
| : "memory", "x2" , "x3", "x4" | |||
| ); | |||
| } while (ret); | |||
| MB; | |||
| } | |||
| #define BLAS_LOCK_DEFINED | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| } | |||
| @@ -89,8 +100,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||
| #define PROLOGUE \ | |||
| .text ;\ | |||
| .align 4 ;\ | |||
| .global REALNAME ;\ | |||
| .func REALNAME ;\ | |||
| .type REALNAME, %function ;\ | |||
| REALNAME: | |||
| #define EPILOGUE | |||
| @@ -107,7 +120,11 @@ REALNAME: | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 4 << 20) | |||
| #if defined(CORTEXA57) | |||
| #define BUFFER_SIZE (20 << 20) | |||
| #else | |||
| #define BUFFER_SIZE (16 << 20) | |||
| #endif | |||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||
| @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| # | |||
| #ifndef L1_PREFETCH | |||
| #define L1_PREFETCH dcbt | |||
| #endif | |||
| @@ -790,6 +797,8 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #endif | |||
| @@ -0,0 +1,73 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
#define STACK_ALLOC_PROTECT

#if defined(STACK_ALLOC_PROTECT)
// Try to detect stack smashing: _SET declares a local marker holding a known
// value and _CHECK asserts that the marker still holds that value.
#include <assert.h>
#define STACK_ALLOC_PROTECT_SET \
  volatile int stack_check = 0x7fc01234;
#define STACK_ALLOC_PROTECT_CHECK \
  assert(stack_check == 0x7fc01234);
#else
// Protection disabled: both hooks expand to nothing.
#define STACK_ALLOC_PROTECT_SET
#define STACK_ALLOC_PROTECT_CHECK
#endif
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
/*
 * Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC.
 * Stack allocation is much faster than blas_memory_alloc or malloc, particularly
 * when OpenBLAS is used from a multi-threaded application.
 * SIZE must be carefully chosen to be:
 * - as small as possible to maximize the number of stack allocations
 * - large enough to support all architectures and kernels
 * Choosing too small a SIZE will lead to stack smashing.
 *
 * SIZE   - number of TYPE elements requested
 * TYPE   - element type of the buffer
 * BUFFER - lvalue that receives the buffer pointer
 *
 * When SIZE exceeds MAX_STACK_ALLOC / sizeof(TYPE), stack_alloc_size is set
 * to 0 and BUFFER comes from blas_memory_alloc instead. The macro declares
 * locals (stack_alloc_size, stack_buffer) in the enclosing scope.
 */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
  /* make it volatile because some functions (ex: dgemv_n.S) */ \
  /* do not restore all registers */ \
  volatile int stack_alloc_size = SIZE; \
  if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
    stack_alloc_size = 0; \
  STACK_ALLOC_PROTECT_SET \
  /* never declare an array of length 0: use one element as a placeholder */ \
  /* when the buffer is going to come from blas_memory_alloc */ \
  TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] __attribute__((aligned(0x20))); \
  BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.
#define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1)
#endif
| #if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 | |||
| /* Counterpart of STACK_ALLOC: runs the canary check, then calls */ | |||
| /* blas_memory_free(BUFFER) only when stack_alloc_size is 0, i.e. when */ | |||
| /* STACK_ALLOC took BUFFER from blas_memory_alloc rather than the stack. */ | |||
| /* Reads stack_alloc_size, which STACK_ALLOC declares in the enclosing scope. */ | |||
| #define STACK_FREE(BUFFER) \ | |||
| STACK_ALLOC_PROTECT_CHECK \ | |||
| if(!stack_alloc_size) \ | |||
| blas_memory_free(BUFFER); | |||
| #else | |||
| /* No stack allocation configured: the buffer always goes back to blas_memory_free. */ | |||
| #define STACK_FREE(BUFFER) blas_memory_free(BUFFER) | |||
| #endif | |||
| @@ -41,6 +41,10 @@ | |||
| #ifndef ASSEMBLER | |||
| #ifdef C_MSVC | |||
| #include <intrin.h> | |||
| #endif | |||
| #define MB | |||
| #define WMB | |||
| @@ -170,12 +174,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| if (y <= 1) return x; | |||
| y = blas_quick_divide_table[y]; | |||
| #if defined(_MSC_VER) && !defined(__clang__) | |||
| (void*)result; | |||
| return x*y; | |||
| result = x/y; | |||
| return result; | |||
| #else | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| return result; | |||
| @@ -396,7 +396,7 @@ REALNAME: | |||
| #define PROFCODE | |||
| #define EPILOGUE .end REALNAME | |||
| #define EPILOGUE .end | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||
| @@ -115,6 +115,9 @@ int detect(void) | |||
| if (strstr(p, "0xc0f")) { | |||
| return CPU_CORTEXA15; | |||
| } | |||
| if (strstr(p, "0xd07")) { | |||
| return CPU_ARMV7; //ARMV8 on 32-bit | |||
| } | |||
| } | |||
| @@ -158,6 +161,27 @@ int detect(void) | |||
| } | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if ((!strncmp("CPU architecture", buffer, 16))) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL) { | |||
| if (strstr(p, "8")) { | |||
| return CPU_ARMV7; //ARMV8 on 32-bit | |||
| } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| @@ -29,12 +29,19 @@ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV8 1 | |||
| #define CPU_CORTEXA57 2 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "ARMV8" | |||
| "UNKNOWN", | |||
| "ARMV8" , | |||
| "CORTEXA57" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "unknown", | |||
| "armv8" , | |||
| "cortexa57" | |||
| }; | |||
| int get_feature(char *search) | |||
| { | |||
| @@ -53,13 +60,13 @@ int get_feature(char *search) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| fclose(infile); | |||
| fclose(infile); | |||
| if( p == NULL ) return; | |||
| if( p == NULL ) return 0; | |||
| t = strtok(p," "); | |||
| while( t = strtok(NULL," ")) | |||
| @@ -82,11 +89,30 @@ int detect(void) | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("CPU part", buffer, 8)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL) { | |||
| if (strstr(p, "0xd07")) { | |||
| return CPU_CORTEXA57; | |||
| } | |||
| } | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) | |||
| if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) || | |||
| (!strncmp("CPU architecture", buffer, 16))) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| @@ -100,7 +126,7 @@ int detect(void) | |||
| if (strstr(p, "AArch64")) | |||
| { | |||
| return CPU_ARMV8; | |||
| return CPU_ARMV8; | |||
| } | |||
| @@ -118,23 +144,13 @@ char *get_corename(void) | |||
| void get_architecture(void) | |||
| { | |||
| printf("ARM"); | |||
| printf("ARM64"); | |||
| } | |||
| void get_subarchitecture(void) | |||
| { | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_ARMV8: | |||
| printf("ARMV8"); | |||
| break; | |||
| default: | |||
| printf("UNKNOWN"); | |||
| break; | |||
| } | |||
| printf("%s", cpuname[d]); | |||
| } | |||
| void get_subdirname(void) | |||
| @@ -160,26 +176,34 @@ void get_cpuconfig(void) | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_CORTEXA57: | |||
| printf("#define CORTEXA57\n"); | |||
| printf("#define HAVE_VFP\n"); | |||
| printf("#define HAVE_VFPV3\n"); | |||
| printf("#define HAVE_NEON\n"); | |||
| printf("#define HAVE_VFPV4\n"); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 2097152\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| } | |||
| void get_libname(void) | |||
| { | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_ARMV8: | |||
| printf("armv8\n"); | |||
| break; | |||
| } | |||
| printf("%s", cpuname_lower[d]); | |||
| } | |||
| void get_features(void) | |||
| { | |||
| @@ -55,6 +55,7 @@ | |||
| #define CPUTYPE_POWER6 5 | |||
| #define CPUTYPE_CELL 6 | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -65,6 +66,7 @@ char *cpuname[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -76,6 +78,7 @@ char *lowercpuname[] = { | |||
| "power6", | |||
| "cell", | |||
| "ppcg4", | |||
| "power8" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -87,6 +90,7 @@ char *corename[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| }; | |||
| int detect(void){ | |||
| @@ -115,7 +119,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -1172,6 +1172,9 @@ int get_cpuname(void){ | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: | |||
| // Avoton | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 5: | |||
| @@ -1229,6 +1232,7 @@ int get_cpuname(void){ | |||
| case 2: | |||
| return CPUTYPE_OPTERON; | |||
| case 1: | |||
| case 3: | |||
| case 10: | |||
| return CPUTYPE_BARCELONA; | |||
| case 6: | |||
| @@ -1239,13 +1243,19 @@ int get_cpuname(void){ | |||
| return CPUTYPE_BULLDOZER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 2: | |||
| case 2: //AMD Piledriver | |||
| case 3: //AMD Richland | |||
| if(support_avx()) | |||
| return CPUTYPE_PILEDRIVER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 0: | |||
| switch(exmodel){ | |||
| case 1: //AMD Trinity | |||
| if(support_avx()) | |||
| return CPUTYPE_PILEDRIVER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 3: | |||
| if(support_avx()) | |||
| return CPUTYPE_STEAMROLLER; | |||
| @@ -1668,6 +1678,9 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 13: | |||
| // Avoton | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 5: | |||
| @@ -1718,7 +1731,8 @@ int get_coretype(void){ | |||
| return CORE_BULLDOZER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| case 2: | |||
| case 2: //AMD Piledriver | |||
| case 3: //AMD Richland | |||
| if(support_avx()) | |||
| return CORE_PILEDRIVER; | |||
| else | |||
| @@ -1726,6 +1740,12 @@ int get_coretype(void){ | |||
| case 0: | |||
| switch(exmodel){ | |||
| case 1: //AMD Trinity | |||
| if(support_avx()) | |||
| return CORE_PILEDRIVER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| case 3: | |||
| if(support_avx()) | |||
| return CORE_STEAMROLLER; | |||
| @@ -1365,8 +1365,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1365,8 +1365,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1335,8 +1335,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1339,8 +1339,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1350,7 +1350,7 @@ | |||
| * | |||
| * Call the subroutine. | |||
| * | |||
| IF( SNAME( 4: 5 ).EQ.'mv' )THEN | |||
| IF( SNAME( 10: 11 ).EQ.'mv' )THEN | |||
| IF( FULL )THEN | |||
| IF( TRACE ) | |||
| $ WRITE( NTRA, FMT = 9993 )NC, SNAME, | |||
| @@ -1376,7 +1376,7 @@ | |||
| CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, | |||
| $ N, AA, XX, INCX ) | |||
| END IF | |||
| ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN | |||
| ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN | |||
| IF( FULL )THEN | |||
| IF( TRACE ) | |||
| $ WRITE( NTRA, FMT = 9993 )NC, SNAME, | |||
| @@ -1465,7 +1465,7 @@ | |||
| END IF | |||
| * | |||
| IF( .NOT.NULL )THEN | |||
| IF( SNAME( 4: 5 ).EQ.'mv' )THEN | |||
| IF( SNAME( 10: 11 ).EQ.'mv' )THEN | |||
| * | |||
| * Check the result. | |||
| * | |||
| @@ -1473,7 +1473,7 @@ | |||
| $ INCX, ZERO, Z, INCX, XT, G, | |||
| $ XX, EPS, ERR, FATAL, NOUT, | |||
| $ .TRUE. ) | |||
| ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN | |||
| ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN | |||
| * | |||
| * Compute approximation to original vector. | |||
| * | |||
| @@ -1611,7 +1611,7 @@ | |||
| * .. Common blocks .. | |||
| COMMON /INFOC/INFOT, NOUTC, OK | |||
| * .. Executable Statements .. | |||
| CONJ = SNAME( 5: 5 ).EQ.'c' | |||
| CONJ = SNAME( 11: 11 ).EQ.'c' | |||
| * Define the number of arguments. | |||
| NARGS = 9 | |||
| * | |||
| @@ -1366,8 +1366,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1366,8 +1366,9 @@ | |||
| * | |||
| 150 CONTINUE | |||
| WRITE( NOUT, FMT = 9996 )SNAME | |||
| CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| IF( TRACE ) | |||
| $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, | |||
| $ M, N, ALPHA, LDA, LDB) | |||
| * | |||
| 160 CONTINUE | |||
| RETURN | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -1,7 +1,7 @@ | |||
| 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
| -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
| F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
| F LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO STOP ON FAILURES. | |||
| T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
| 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
| 16.0 THRESHOLD VALUE OF TEST RATIO | |||
| @@ -55,7 +55,7 @@ | |||
| static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | |||
| FLOAT *a, *x, *y; | |||
| BLASLONG incx, incy; | |||
| BLASLONG incx; | |||
| BLASLONG m_from, m_to, i; | |||
| #ifndef COMPLEX | |||
| FLOAT result; | |||
| @@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| y = (FLOAT *)args -> c; | |||
| incx = args -> ldb; | |||
| incy = args -> ldc; | |||
| m_from = 0; | |||
| m_to = args -> m; | |||
| @@ -43,7 +43,7 @@ | |||
| static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | |||
| FLOAT *a, *x, *y; | |||
| BLASLONG lda, incx, incy; | |||
| BLASLONG incx, incy; | |||
| BLASLONG i, m_from, m_to; | |||
| FLOAT alpha_r; | |||
| #ifdef COMPLEX | |||
| @@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||
| incx = args -> lda; | |||
| incy = args -> ldb; | |||
| lda = args -> ldc; | |||
| alpha_r = *((FLOAT *)args -> alpha + 0); | |||
| #ifdef COMPLEX | |||
| @@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||
| BLASLONG incx; | |||
| BLASLONG i, m_from, m_to; | |||
| FLOAT alpha_r; | |||
| #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) | |||
| #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) | |||
| FLOAT alpha_i; | |||
| #endif | |||
| @@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||
| incx = args -> lda; | |||
| alpha_r = *((FLOAT *)args -> alpha + 0); | |||
| #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) | |||
| #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) | |||
| alpha_i = *((FLOAT *)args -> alpha + 1); | |||
| #endif | |||
| @@ -55,7 +55,7 @@ | |||
| static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | |||
| FLOAT *a, *x, *y; | |||
| BLASLONG lda, incx, incy; | |||
| BLASLONG lda, incx; | |||
| BLASLONG m_from, m_to; | |||
| a = (FLOAT *)args -> a; | |||
| @@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| lda = args -> lda; | |||
| incx = args -> ldb; | |||
| incy = args -> ldc; | |||
| m_from = 0; | |||
| m_to = args -> m; | |||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -43,12 +43,10 @@ | |||
| int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -43,12 +43,10 @@ | |||
| int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| #endif | |||
| x = buffer; | |||
| buffer += ((COMPSIZE * args -> m + 1023) & ~1023); | |||
| buffer += ((COMPSIZE * args -> m + 3) & ~3); | |||
| } | |||
| #ifndef TRANS | |||
| @@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| if (num_cpu) { | |||
| queue[0].sa = NULL; | |||
| queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; | |||
| queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE; | |||
| queue[num_cpu - 1].next = NULL; | |||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| #if (TRANSA == 2) || (TRANSA == 4) | |||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| #if (TRANSA == 2) || (TRANSA == 4) | |||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| #if (TRANSA == 2) || (TRANSA == 4) | |||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| BLASLONG i; | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| BLASLONG length; | |||
| #if (TRANSA == 2) || (TRANSA == 4) | |||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); | |||
| COPY_K(n, b, incb, buffer, 1); | |||
| } | |||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| #ifndef UNIT | |||
| FLOAT atemp1, atemp2, btemp1, btemp2; | |||
| #endif | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| #ifndef UNIT | |||
| FLOAT atemp1, atemp2, btemp1, btemp2; | |||
| #endif | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| #ifndef UNIT | |||
| FLOAT ar, ai, br, bi, ratio, den; | |||
| #endif | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||
| #ifndef UNIT | |||
| FLOAT ar, ai, br, bi, ratio, den; | |||
| #endif | |||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||
| FLOAT *B = b; | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -48,8 +48,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type}) | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) | |||
| # Need to set CONJ for trmm and trsm | |||
| GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) | |||
| GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type}) | |||
| @@ -72,6 +71,10 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) | |||
| if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| #herk | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) | |||
| GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) | |||
| #hemm | |||
| GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type}) | |||
| GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type}) | |||
| @@ -96,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| endif() | |||
| endif () | |||
| endforeach () | |||
| # for gemm3m | |||
| # When USE_GEMM3M is set, generate one gemm3m object per entry of GEMM_DEFINES. | |||
| # The object name carries the lower-cased define (gemm3m_<define>); a | |||
| # gemm3m_thread_<define> variant built with THREADED_LEVEL3 is added for SMP | |||
| # builds that do not use USE_SIMPLE_THREADED_LEVEL3. | |||
| if(USE_GEMM3M) | |||
| foreach (GEMM_DEFINE ${GEMM_DEFINES}) | |||
| # Quote the expansion so it is always passed as exactly one argument. | |||
| string(TOLOWER "${GEMM_DEFINE}" GEMM_DEFINE_LC) | |||
| GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) | |||
| if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) | |||
| endif () | |||
| endforeach () | |||
| endif() | |||
| endif () | |||
| endforeach () | |||
| @@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; | |||
| BLASLONG procs, num_cpu_m, num_cpu_n; | |||
| BLASLONG width, i, j; | |||
| BLASLONG divM, divN; | |||
| @@ -335,7 +335,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| @@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| BLASLONG is, min_i, div_n; | |||
| BLASLONG i, current; | |||
| BLASLONG l1stride, l2size; | |||
| BLASLONG l1stride; | |||
| #ifdef TIMING | |||
| BLASULONG rpcc_counter; | |||
| @@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #endif | |||
| ) return 0; | |||
| l2size = GEMM_P * GEMM_Q; | |||
| #if 0 | |||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", | |||
| mypos, m_from, m_to, n_from, n_to, N_from, N_to); | |||
| @@ -369,7 +367,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| START_RPCC(); | |||
| @@ -706,7 +706,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| n = n_to - n_from; | |||
| } | |||
| if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { | |||
| if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) { | |||
| GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| @@ -33,6 +33,7 @@ set(COMMON_SOURCES | |||
| xerbla.c | |||
| openblas_set_num_threads.c | |||
| openblas_error_handle.c | |||
| openblas_env.c | |||
| openblas_get_num_procs.c | |||
| openblas_get_num_threads.c | |||
| ) | |||
| @@ -1,7 +1,7 @@ | |||
| TOPDIR = ../.. | |||
| include ../../Makefile.system | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) | |||
| #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | |||
| @@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c | |||
| openblas_error_handle.$(SUFFIX) : openblas_error_handle.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_env.$(SUFFIX) : openblas_env.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) | |||
| #include <dlfcn.h> | |||
| #include <signal.h> | |||
| #include <sys/resource.h> | |||
| @@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #endif | |||
| extern unsigned int openblas_thread_timeout(); | |||
| #ifdef SMP_SERVER | |||
| #undef MONITOR | |||
| @@ -524,6 +526,7 @@ static int blas_monitor(void *arg){ | |||
| int blas_thread_init(void){ | |||
| BLASLONG i; | |||
| int ret; | |||
| int thread_timeout_env; | |||
| #ifdef NEED_STACKATTR | |||
| pthread_attr_t attr; | |||
| #endif | |||
| @@ -540,22 +543,12 @@ int blas_thread_init(void){ | |||
| if (!blas_server_avail){ | |||
| env_var_t p; | |||
| if (readenv(p,"THREAD_TIMEOUT")) { | |||
| thread_timeout = atoi(p); | |||
| if (thread_timeout < 4) thread_timeout = 4; | |||
| if (thread_timeout > 30) thread_timeout = 30; | |||
| thread_timeout = (1 << thread_timeout); | |||
| }else{ | |||
| if (readenv(p,"GOTO_THREAD_TIMEOUT")) { | |||
| thread_timeout = atoi(p); | |||
| if (thread_timeout < 4) thread_timeout = 4; | |||
| if (thread_timeout > 30) thread_timeout = 30; | |||
| thread_timeout = (1 << thread_timeout); | |||
| } | |||
| } | |||
| thread_timeout_env=openblas_thread_timeout(); | |||
| if (thread_timeout_env>0) { | |||
| if (thread_timeout_env < 4) thread_timeout_env = 4; | |||
| if (thread_timeout_env > 30) thread_timeout_env = 30; | |||
| thread_timeout = (1 << thread_timeout_env); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| @@ -576,10 +569,12 @@ int blas_thread_init(void){ | |||
| struct rlimit rlim; | |||
| const char *msg = strerror(ret); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); | |||
| #ifdef RLIMIT_NPROC | |||
| if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " | |||
| "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max)); | |||
| } | |||
| #endif | |||
| if(0 != raise(SIGINT)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n"); | |||
| exit(EXIT_FAILURE); | |||
| @@ -261,6 +261,11 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Avoton | |||
| if (model == 13) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| case 5: | |||
| //Intel Broadwell | |||
| @@ -318,7 +323,7 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else if(model == 2){ | |||
| }else if(model == 2 || model == 3){ | |||
| //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 | |||
| if(support_avx()) | |||
| return &gotoblas_PILEDRIVER; | |||
| @@ -327,7 +332,15 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else if(model == 0){ | |||
| if (exmodel == 3) { | |||
| if (exmodel == 1) { | |||
| //AMD Trinity | |||
| if(support_avx()) | |||
| return &gotoblas_PILEDRIVER; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else if (exmodel == 3) { | |||
| //AMD STEAMROLLER | |||
| if(support_avx()) | |||
| return &gotoblas_STEAMROLLER; | |||
| @@ -378,7 +391,7 @@ static char *corename[] = { | |||
| "Nehalem", | |||
| "Athlon", | |||
| "Opteron", | |||
| "Opteron(SSE3)", | |||
| "Opteron_SSE3", | |||
| "Barcelona", | |||
| "Nano", | |||
| "Sandybridge", | |||
| @@ -104,6 +104,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <errno.h> | |||
| #include <linux/unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <sys/time.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| @@ -142,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(_MSC_VER) && !defined(__clang__) | |||
| #define CONSTRUCTOR __cdecl | |||
| #define DESTRUCTOR __cdecl | |||
| #elif defined(OS_DARWIN) && defined(C_GCC) | |||
| #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #else | |||
| @@ -167,7 +169,7 @@ void goto_set_num_threads(int num_threads) {}; | |||
| #else | |||
| #ifdef OS_LINUX | |||
| #if defined(OS_LINUX) || defined(OS_SUNOS) | |||
| #ifndef NO_AFFINITY | |||
| int get_num_procs(void); | |||
| #else | |||
| @@ -292,8 +294,11 @@ void openblas_fork_handler() | |||
| #endif | |||
| } | |||
| extern int openblas_num_threads_env(); | |||
| extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| env_var_t p; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| @@ -308,18 +313,18 @@ int blas_get_cpu_number(void){ | |||
| blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p); | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| if (blas_goto_num == 0) { | |||
| if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| blas_goto_num=openblas_goto_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| } | |||
| #endif | |||
| blas_omp_num = 0; | |||
| if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p); | |||
| blas_omp_num=openblas_omp_num_threads_env(); | |||
| if (blas_omp_num < 0) blas_omp_num = 0; | |||
| if (blas_goto_num > 0) blas_num_threads = blas_goto_num; | |||
| @@ -355,7 +360,9 @@ int openblas_get_num_threads(void) { | |||
| #ifndef SMP | |||
| return 1; | |||
| #else | |||
| return blas_get_cpu_number(); | |||
| // init blas_cpu_number if needed | |||
| blas_get_cpu_number(); | |||
| return blas_cpu_number; | |||
| #endif | |||
| } | |||
| @@ -914,7 +921,6 @@ static volatile struct { | |||
| } memory[NUM_BUFFERS]; | |||
| static int memory_initialized = 0; | |||
| static void gotoblas_memory_init(void); | |||
| /* Memory allocation routine */ | |||
| /* procpos ... indicates where it comes from */ | |||
| @@ -1337,6 +1343,7 @@ static void gotoblas_memory_init(void) { | |||
| /* Initialization for all function; this function should be called before main */ | |||
| static int gotoblas_initialized = 0; | |||
| extern void openblas_read_env(); | |||
| void CONSTRUCTOR gotoblas_init(void) { | |||
| @@ -1346,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) { | |||
| openblas_fork_handler(); | |||
| #endif | |||
| openblas_read_env(); | |||
| #ifdef PROFILE | |||
| moncontrol (0); | |||
| #endif | |||
| @@ -1362,6 +1371,19 @@ void CONSTRUCTOR gotoblas_init(void) { | |||
| gotoblas_memory_init(); | |||
| #endif | |||
| //#if defined(OS_LINUX) | |||
| #if 0 | |||
| struct rlimit curlimit; | |||
| if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) | |||
| { | |||
| if ( curlimit.rlim_cur != curlimit.rlim_max ) | |||
| { | |||
| curlimit.rlim_cur = curlimit.rlim_max; | |||
| setrlimit(RLIMIT_STACK, &curlimit); | |||
| } | |||
| } | |||
| #endif | |||
| #ifdef SMP | |||
| if (blas_cpu_number == 0) blas_get_cpu_number(); | |||
| #ifdef SMP_SERVER | |||
| @@ -0,0 +1,84 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| static int openblas_env_verbose=0; | |||
| static unsigned int openblas_env_thread_timeout=0; | |||
| static int openblas_env_block_factor=0; | |||
| static int openblas_env_openblas_num_threads=0; | |||
| static int openblas_env_goto_num_threads=0; | |||
| static int openblas_env_omp_num_threads=0; | |||
| int openblas_verbose() { return openblas_env_verbose;} | |||
| unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} | |||
| int openblas_block_factor() { return openblas_env_block_factor;} | |||
| int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} | |||
| int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} | |||
| int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} | |||
| void openblas_read_env() { | |||
| int ret=0; | |||
| env_var_t p; | |||
| if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_verbose=ret; | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_block_factor=ret; | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_thread_timeout=(unsigned int)ret; | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_openblas_num_threads=ret; | |||
| ret=0; | |||
| if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_goto_num_threads=ret; | |||
| ret=0; | |||
| if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_omp_num_threads=ret; | |||
| } | |||
| @@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| int openblas_verbose() { | |||
| int ret=0; | |||
| env_var_t p; | |||
| if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| return ret; | |||
| } | |||
| extern int openblas_verbose(); | |||
| void openblas_warning(int verbose, const char * msg) { | |||
| int current_verbose; | |||
| @@ -40,6 +40,7 @@ | |||
| #include <string.h> | |||
| #include "common.h" | |||
| extern int openblas_block_factor(); | |||
| int get_L2_size(void); | |||
| #define DEFAULT_GEMM_P 128 | |||
| @@ -249,7 +250,6 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| env_var_t p; | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||
| int size = 16; | |||
| @@ -468,9 +468,8 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| if (readenv(p,"GOTO_BLOCK_FACTOR")) { | |||
| factor = atoi(p); | |||
| factor=openblas_block_factor(); | |||
| if (factor>0) { | |||
| if (factor < 10) factor = 10; | |||
| if (factor > 200) factor = 200; | |||
| @@ -26,10 +26,16 @@ ifndef ONLY_CBLAS | |||
| ONLY_CBLAS = 0 | |||
| endif | |||
| ifndef BUILD_LAPACK_DEPRECATED | |||
| BUILD_LAPACK_DEPRECATED = 0 | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifndef ONLY_CBLAS | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| EXTRALIB += -lgomp | |||
| @@ -39,9 +45,11 @@ endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifndef ONLY_CBLAS | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| endif | |||
| endif | |||
| all:: | |||
| @@ -88,17 +96,17 @@ dll : ../$(LIBDLLNAME) | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | |||
| libopenblas.def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| libgoto_hpl.def : gensymbol | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | |||
| $(LIBDYNNAME) : ../$(LIBNAME) osx.def | |||
| else | |||
| ../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def | |||
| $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed | |||
| $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def | |||
| ../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def | |||
| $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed | |||
| $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||
| endif | |||
| ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) | |||
| #only build without Fortran | |||
| @@ -110,7 +118,7 @@ endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| so : ../$(LIBSONAME) | |||
| @@ -201,26 +209,26 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| include ../Makefile.tail | |||
| @@ -173,18 +173,18 @@ | |||
| sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, | |||
| sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, | |||
| sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, | |||
| sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, | |||
| sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, | |||
| sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, | |||
| sgehd2, sgehrd, sgelq2, sgelqf, | |||
| sgels, sgelsd, sgelss, sgelsy, sgeql2, sgeqlf, | |||
| sgeqp3, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, | |||
| sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, | |||
| sgetc2, sgetri, | |||
| sggbak, sggbal, sgges, sggesx, sggev, sggevx, | |||
| sggglm, sgghrd, sgglse, sggqrf, | |||
| sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, | |||
| sggrqf, sgtcon, sgtrfs, sgtsv, | |||
| sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, | |||
| shsein, shseqr, slabrd, slacon, slacn2, | |||
| slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, | |||
| slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, | |||
| slahr2, slaic1, slaln2, slals0, slalsa, slalsd, | |||
| slangb, slange, slangt, slanhs, slansb, slansp, | |||
| slansy, slantb, slantp, slantr, slanv2, | |||
| slapll, slapmt, | |||
| @@ -194,7 +194,7 @@ | |||
| slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, | |||
| slarrv, slartv, | |||
| slarz, slarzb, slarzt, slasy2, slasyf, | |||
| slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, | |||
| slatbs, slatdf, slatps, slatrd, slatrs, slatrz, | |||
| sopgtr, sopmtr, sorg2l, sorg2r, | |||
| sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, | |||
| sorgrq, sorgtr, sorm2l, sorm2r, | |||
| @@ -220,7 +220,7 @@ | |||
| stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, | |||
| stptrs, | |||
| strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, | |||
| strtrs, stzrqf, stzrzf, sstemr, | |||
| strtrs, stzrzf, sstemr, | |||
| slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp, | |||
| stfttr, stpttf, stpttr, strttf, strttp, | |||
| sgejsv, sgesvj, sgsvj0, sgsvj1, | |||
| @@ -245,14 +245,13 @@ | |||
| cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, | |||
| cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, | |||
| cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, | |||
| cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, | |||
| cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, | |||
| cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, | |||
| cgehd2, cgehrd, cgelq2, cgelqf, | |||
| cgels, cgelsd, cgelss, cgelsy, cgeql2, cgeqlf, cgeqp3, | |||
| cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, | |||
| cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, | |||
| cgesvx, cgetc2, cgetri, | |||
| cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, | |||
| cgghrd, cgglse, cggqrf, cggrqf, | |||
| cggsvd, cggsvp, | |||
| cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, | |||
| chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, | |||
| checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, | |||
| @@ -267,7 +266,7 @@ | |||
| claed0, claed7, claed8, | |||
| claein, claesy, claev2, clags2, clagtm, | |||
| clahef, clahqr, | |||
| clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, | |||
| clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, | |||
| clanhb, clanhe, | |||
| clanhp, clanhs, clanht, clansb, clansp, clansy, clantb, | |||
| clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, | |||
| @@ -278,7 +277,7 @@ | |||
| clarfx, clargv, clarnv, clarrv, clartg, clartv, | |||
| clarz, clarzb, clarzt, clascl, claset, clasr, classq, | |||
| clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, | |||
| clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, | |||
| cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, | |||
| cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, | |||
| cposv, cposvx, cpstrf, cpstf2, | |||
| cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, | |||
| @@ -293,7 +292,7 @@ | |||
| ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, | |||
| ctprfs, ctptri, | |||
| ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, | |||
| ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r, | |||
| ctrsyl, ctrtrs, ctzrzf, cung2l, cung2r, | |||
| cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2, | |||
| cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, | |||
| cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, | |||
| @@ -321,18 +320,18 @@ | |||
| dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, | |||
| dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, | |||
| dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, | |||
| dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, | |||
| dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, | |||
| dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, | |||
| dgehd2, dgehrd, dgelq2, dgelqf, | |||
| dgels, dgelsd, dgelss, dgelsy, dgeql2, dgeqlf, | |||
| dgeqp3, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, | |||
| dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, | |||
| dgetc2, dgetri, | |||
| dggbak, dggbal, dgges, dggesx, dggev, dggevx, | |||
| dggglm, dgghrd, dgglse, dggqrf, | |||
| dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, | |||
| dggrqf, dgtcon, dgtrfs, dgtsv, | |||
| dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, | |||
| dhsein, dhseqr, dlabrd, dlacon, dlacn2, | |||
| dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, | |||
| dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, | |||
| dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, | |||
| dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, | |||
| dlansy, dlantb, dlantp, dlantr, dlanv2, | |||
| dlapll, dlapmt, | |||
| @@ -342,7 +341,7 @@ | |||
| dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, | |||
| dlargv, dlarrv, dlartv, | |||
| dlarz, dlarzb, dlarzt, dlasy2, dlasyf, | |||
| dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, | |||
| dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, | |||
| dopgtr, dopmtr, dorg2l, dorg2r, | |||
| dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, | |||
| dorgrq, dorgtr, dorm2l, dorm2r, | |||
| @@ -368,7 +367,7 @@ | |||
| dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, | |||
| dtptrs, | |||
| dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, | |||
| dtrtrs, dtzrqf, dtzrzf, dstemr, | |||
| dtrtrs, dtzrzf, dstemr, | |||
| dsgesv, dsposv, dlag2s, slag2d, dlat2s, | |||
| dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp, | |||
| dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, | |||
| @@ -387,14 +386,13 @@ | |||
| zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, | |||
| zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, | |||
| zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, | |||
| zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, | |||
| zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, | |||
| zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, | |||
| zgehd2, zgehrd, zgelq2, zgelqf, | |||
| zgels, zgelsd, zgelss, zgelsy, zgeql2, zgeqlf, zgeqp3, | |||
| zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, | |||
| zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, | |||
| zgetri, | |||
| zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, | |||
| zgghrd, zgglse, zggqrf, zggrqf, | |||
| zggsvd, zggsvp, | |||
| zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, | |||
| zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, | |||
| zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, | |||
| @@ -409,7 +407,7 @@ | |||
| zlaed0, zlaed7, zlaed8, | |||
| zlaein, zlaesy, zlaev2, zlags2, zlagtm, | |||
| zlahef, zlahqr, | |||
| zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, | |||
| zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, | |||
| zlangt, zlanhb, | |||
| zlanhe, | |||
| zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb, | |||
| @@ -422,7 +420,7 @@ | |||
| zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, | |||
| zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, | |||
| zlassq, zlasyf, | |||
| zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, | |||
| zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, | |||
| zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, | |||
| zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, | |||
| zposv, zposvx, zpotrs, zpstrf, zpstf2, | |||
| @@ -438,7 +436,7 @@ | |||
| ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, | |||
| ztprfs, ztptri, | |||
| ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, | |||
| ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l, | |||
| ztrsyl, ztrtrs, ztzrzf, zung2l, | |||
| zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2, | |||
| zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2, | |||
| zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, | |||
| @@ -452,6 +450,139 @@ | |||
| zunbdb5, zunbdb6, zuncsd, zuncsd2by1, | |||
| zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, | |||
| ztpqrt, ztpqrt2, ztpmqrt, ztprfb, | |||
| # functions added for lapack-3.6.0 | |||
| cgejsv, | |||
| cgesvdx, | |||
| cgesvj, | |||
| cgetrf2, | |||
| cgges3, | |||
| cggev3, | |||
| cgghd3, | |||
| cggsvd3, | |||
| cggsvp3, | |||
| cgsvj0, | |||
| cgsvj1, | |||
| clagge, | |||
| claghe, | |||
| clagsy, | |||
| clahilb, | |||
| clakf2, | |||
| clarge, | |||
| clarnd, | |||
| claror, | |||
| clarot, | |||
| clatm1, | |||
| clatm2, | |||
| clatm3, | |||
| clatm5, | |||
| clatm6, | |||
| clatme, | |||
| clatmr, | |||
| clatms, | |||
| clatmt, | |||
| cpotrf2, | |||
| csbmv, | |||
| cspr2, | |||
| csyr2, | |||
| cunm22, | |||
| dbdsvdx, | |||
| dgesvdx, | |||
| dgetrf2, | |||
| dgges3, | |||
| dggev3, | |||
| dgghd3, | |||
| dggsvd3, | |||
| dggsvp3, | |||
| dladiv2, | |||
| dlagge, | |||
| dlagsy, | |||
| dlahilb, | |||
| dlakf2, | |||
| dlaran, | |||
| dlarge, | |||
| dlarnd, | |||
| dlaror, | |||
| dlarot, | |||
| dlatm1, | |||
| dlatm2, | |||
| dlatm3, | |||
| dlatm5, | |||
| dlatm6, | |||
| dlatm7, | |||
| dlatme, | |||
| dlatmr, | |||
| dlatms, | |||
| dlatmt, | |||
| dorm22, | |||
| dpotrf2, | |||
| dsecnd, | |||
| sbdsvdx, | |||
| second, | |||
| sgesvdx, | |||
| sgetrf2, | |||
| sgges3, | |||
| sggev3, | |||
| sgghd3, | |||
| sggsvd3, | |||
| sggsvp3, | |||
| sladiv2, | |||
| slagge, | |||
| slagsy, | |||
| slahilb, | |||
| slakf2, | |||
| slaran, | |||
| slarge, | |||
| slarnd, | |||
| slaror, | |||
| slarot, | |||
| slatm1, | |||
| slatm2, | |||
| slatm3, | |||
| slatm5, | |||
| slatm6, | |||
| slatm7, | |||
| slatme, | |||
| slatmr, | |||
| slatms, | |||
| slatmt, | |||
| sorm22, | |||
| spotrf2, | |||
| zgejsv, | |||
| zgesvdx, | |||
| zgesvj, | |||
| zgetrf2, | |||
| zgges3, | |||
| zggev3, | |||
| zgghd3, | |||
| zggsvd3, | |||
| zggsvp3, | |||
| zgsvj0, | |||
| zgsvj1, | |||
| zlagge, | |||
| zlaghe, | |||
| zlagsy, | |||
| zlahilb, | |||
| zlakf2, | |||
| zlarge, | |||
| zlarnd, | |||
| zlaror, | |||
| zlarot, | |||
| zlatm1, | |||
| zlatm2, | |||
| zlatm3, | |||
| zlatm5, | |||
| zlatm6, | |||
| zlatme, | |||
| zlatmr, | |||
| zlatms, | |||
| zlatmt, | |||
| zpotrf2, | |||
| zsbmv, | |||
| zspr2, | |||
| zsyr2, | |||
| zunm22 | |||
| ); | |||
| @lapack_extendedprecision_objs = ( | |||
| @@ -459,6 +590,13 @@ | |||
| dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | |||
| ); | |||
| @lapack_deprecated_objs = ( | |||
| cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp, | |||
| cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd, | |||
| cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm, | |||
| cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf, | |||
| ); | |||
| @lapackeobjs = ( | |||
| # LAPACK C interface routines. | |||
| # | |||
| @@ -682,8 +820,6 @@ | |||
| LAPACKE_cgeqlf_work, | |||
| LAPACKE_cgeqp3, | |||
| LAPACKE_cgeqp3_work, | |||
| LAPACKE_cgeqpf, | |||
| LAPACKE_cgeqpf_work, | |||
| LAPACKE_cgeqr2, | |||
| LAPACKE_cgeqr2_work, | |||
| LAPACKE_cgeqrf, | |||
| @@ -738,10 +874,6 @@ | |||
| LAPACKE_cggqrf_work, | |||
| LAPACKE_cggrqf, | |||
| LAPACKE_cggrqf_work, | |||
| LAPACKE_cggsvd, | |||
| LAPACKE_cggsvd_work, | |||
| LAPACKE_cggsvp, | |||
| LAPACKE_cggsvp_work, | |||
| LAPACKE_cgtcon, | |||
| LAPACKE_cgtcon_work, | |||
| LAPACKE_cgtrfs, | |||
| @@ -1186,8 +1318,6 @@ | |||
| LAPACKE_dgeqlf_work, | |||
| LAPACKE_dgeqp3, | |||
| LAPACKE_dgeqp3_work, | |||
| LAPACKE_dgeqpf, | |||
| LAPACKE_dgeqpf_work, | |||
| LAPACKE_dgeqr2, | |||
| LAPACKE_dgeqr2_work, | |||
| LAPACKE_dgeqrf, | |||
| @@ -1244,10 +1374,6 @@ | |||
| LAPACKE_dggqrf_work, | |||
| LAPACKE_dggrqf, | |||
| LAPACKE_dggrqf_work, | |||
| LAPACKE_dggsvd, | |||
| LAPACKE_dggsvd_work, | |||
| LAPACKE_dggsvp, | |||
| LAPACKE_dggsvp_work, | |||
| LAPACKE_dgtcon, | |||
| LAPACKE_dgtcon_work, | |||
| LAPACKE_dgtrfs, | |||
| @@ -1676,8 +1802,6 @@ | |||
| LAPACKE_sgeqlf_work, | |||
| LAPACKE_sgeqp3, | |||
| LAPACKE_sgeqp3_work, | |||
| LAPACKE_sgeqpf, | |||
| LAPACKE_sgeqpf_work, | |||
| LAPACKE_sgeqr2, | |||
| LAPACKE_sgeqr2_work, | |||
| LAPACKE_sgeqrf, | |||
| @@ -1734,10 +1858,6 @@ | |||
| LAPACKE_sggqrf_work, | |||
| LAPACKE_sggrqf, | |||
| LAPACKE_sggrqf_work, | |||
| LAPACKE_sggsvd, | |||
| LAPACKE_sggsvd_work, | |||
| LAPACKE_sggsvp, | |||
| LAPACKE_sggsvp_work, | |||
| LAPACKE_sgtcon, | |||
| LAPACKE_sgtcon_work, | |||
| LAPACKE_sgtrfs, | |||
| @@ -2158,8 +2278,6 @@ | |||
| LAPACKE_zgeqlf_work, | |||
| LAPACKE_zgeqp3, | |||
| LAPACKE_zgeqp3_work, | |||
| LAPACKE_zgeqpf, | |||
| LAPACKE_zgeqpf_work, | |||
| LAPACKE_zgeqr2, | |||
| LAPACKE_zgeqr2_work, | |||
| LAPACKE_zgeqrf, | |||
| @@ -2214,10 +2332,6 @@ | |||
| LAPACKE_zggqrf_work, | |||
| LAPACKE_zggrqf, | |||
| LAPACKE_zggrqf_work, | |||
| LAPACKE_zggsvd, | |||
| LAPACKE_zggsvd_work, | |||
| LAPACKE_zggsvp, | |||
| LAPACKE_zggsvp_work, | |||
| LAPACKE_zgtcon, | |||
| LAPACKE_zgtcon_work, | |||
| LAPACKE_zgtrfs, | |||
| @@ -2707,6 +2821,134 @@ | |||
| LAPACKE_slagsy_work, | |||
| LAPACKE_zlagsy, | |||
| LAPACKE_zlagsy_work, | |||
| ## new functions from lapack-3.6.0 | |||
| LAPACKE_cgejsv, | |||
| LAPACKE_cgejsv_work, | |||
| LAPACKE_cgesvdx, | |||
| LAPACKE_cgesvdx_work, | |||
| LAPACKE_cgesvj, | |||
| LAPACKE_cgesvj_work, | |||
| LAPACKE_cgetrf2, | |||
| LAPACKE_cgetrf2_work, | |||
| LAPACKE_cgges3, | |||
| LAPACKE_cgges3_work, | |||
| LAPACKE_cggev3, | |||
| LAPACKE_cggev3_work, | |||
| LAPACKE_cgghd3, | |||
| LAPACKE_cgghd3_work, | |||
| LAPACKE_cggsvd3, | |||
| LAPACKE_cggsvd3_work, | |||
| LAPACKE_cggsvp3, | |||
| LAPACKE_cggsvp3_work, | |||
| LAPACKE_chetrf_rook, | |||
| LAPACKE_chetrf_rook_work, | |||
| LAPACKE_chetrs_rook, | |||
| LAPACKE_chetrs_rook_work, | |||
| LAPACKE_clapmt, | |||
| LAPACKE_clapmt_work, | |||
| LAPACKE_clascl, | |||
| LAPACKE_clascl_work, | |||
| LAPACKE_cpotrf2, | |||
| LAPACKE_cpotrf2_work, | |||
| LAPACKE_csytrf_rook, | |||
| LAPACKE_csytrf_rook_work, | |||
| LAPACKE_csytrs_rook, | |||
| LAPACKE_csytrs_rook_work, | |||
| LAPACKE_cuncsd2by1, | |||
| LAPACKE_cuncsd2by1_work, | |||
| LAPACKE_dbdsvdx, | |||
| LAPACKE_dbdsvdx_work, | |||
| LAPACKE_dgesvdx, | |||
| LAPACKE_dgesvdx_work, | |||
| LAPACKE_dgetrf2, | |||
| LAPACKE_dgetrf2_work, | |||
| LAPACKE_dgges3, | |||
| LAPACKE_dgges3_work, | |||
| LAPACKE_dggev3, | |||
| LAPACKE_dggev3_work, | |||
| LAPACKE_dgghd3, | |||
| LAPACKE_dgghd3_work, | |||
| LAPACKE_dggsvd3, | |||
| LAPACKE_dggsvd3_work, | |||
| LAPACKE_dggsvp3, | |||
| LAPACKE_dggsvp3_work, | |||
| LAPACKE_dlapmt, | |||
| LAPACKE_dlapmt_work, | |||
| LAPACKE_dlascl, | |||
| LAPACKE_dlascl_work, | |||
| LAPACKE_dorcsd2by1, | |||
| LAPACKE_dorcsd2by1_work, | |||
| LAPACKE_dpotrf2, | |||
| LAPACKE_dpotrf2_work, | |||
| LAPACKE_dsytrf_rook, | |||
| LAPACKE_dsytrf_rook_work, | |||
| LAPACKE_dsytrs_rook, | |||
| LAPACKE_dsytrs_rook_work, | |||
| LAPACKE_sbdsvdx, | |||
| LAPACKE_sbdsvdx_work, | |||
| LAPACKE_sgesvdx, | |||
| LAPACKE_sgesvdx_work, | |||
| LAPACKE_sgetrf2, | |||
| LAPACKE_sgetrf2_work, | |||
| LAPACKE_sgges3, | |||
| LAPACKE_sgges3_work, | |||
| LAPACKE_sggev3, | |||
| LAPACKE_sggev3_work, | |||
| LAPACKE_sgghd3, | |||
| LAPACKE_sgghd3_work, | |||
| LAPACKE_sggsvd3, | |||
| LAPACKE_sggsvd3_work, | |||
| LAPACKE_sggsvp3, | |||
| LAPACKE_sggsvp3_work, | |||
| LAPACKE_slapmt, | |||
| LAPACKE_slapmt_work, | |||
| LAPACKE_slascl, | |||
| LAPACKE_slascl_work, | |||
| LAPACKE_sorcsd2by1, | |||
| LAPACKE_sorcsd2by1_work, | |||
| LAPACKE_spotrf2, | |||
| LAPACKE_spotrf2_work, | |||
| LAPACKE_ssytrf_rook, | |||
| LAPACKE_ssytrf_rook_work, | |||
| LAPACKE_ssytrs_rook, | |||
| LAPACKE_ssytrs_rook_work, | |||
| LAPACKE_stpqrt, | |||
| LAPACKE_stpqrt_work, | |||
| LAPACKE_zgejsv, | |||
| LAPACKE_zgejsv_work, | |||
| LAPACKE_zgesvdx, | |||
| LAPACKE_zgesvdx_work, | |||
| LAPACKE_zgesvj, | |||
| LAPACKE_zgesvj_work, | |||
| LAPACKE_zgetrf2, | |||
| LAPACKE_zgetrf2_work, | |||
| LAPACKE_zgges3, | |||
| LAPACKE_zgges3_work, | |||
| LAPACKE_zggev3, | |||
| LAPACKE_zggev3_work, | |||
| LAPACKE_zgghd3, | |||
| LAPACKE_zgghd3_work, | |||
| LAPACKE_zggsvd3, | |||
| LAPACKE_zggsvd3_work, | |||
| LAPACKE_zggsvp3, | |||
| LAPACKE_zggsvp3_work, | |||
| LAPACKE_zhetrf_rook, | |||
| LAPACKE_zhetrf_rook_work, | |||
| LAPACKE_zhetrs_rook, | |||
| LAPACKE_zhetrs_rook_work, | |||
| LAPACKE_zlapmt, | |||
| LAPACKE_zlapmt_work, | |||
| LAPACKE_zlascl, | |||
| LAPACKE_zlascl_work, | |||
| LAPACKE_zpotrf2, | |||
| LAPACKE_zpotrf2_work, | |||
| LAPACKE_zsytrf_rook, | |||
| LAPACKE_zsytrf_rook_work, | |||
| LAPACKE_zsytrs_rook, | |||
| LAPACKE_zsytrs_rook_work, | |||
| LAPACKE_zuncsd2by1, | |||
| LAPACKE_zuncsd2by1_work | |||
| ); | |||
| #These functions may need 2 underscores. | |||
| @@ -2749,6 +2991,11 @@ if ($ARGV[8] == 1) { | |||
| @need_2underscore_objs = (@lapack_embeded_underscore_objs); | |||
| }; | |||
| if ($ARGV[11] == 1){ | |||
| #BUILD_LAPACK_DEPRECATED=1 | |||
| @underscore_objs =(@underscore_objs, @lapack_deprecated_objs); | |||
| } | |||
| } else { | |||
| @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); | |||
| } | |||
| @@ -1,5 +1,7 @@ | |||
| #!/usr/bin/perl | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| # | |||
| # 1. Not specified | |||
| # 1.1 Automatically detect, then check compiler | |||
| @@ -272,8 +274,9 @@ if ($link ne "") { | |||
| } | |||
| if ($flags =~ /^\-Y/) { | |||
| next if ($hostos eq 'SunOS'); | |||
| $linker_L .= "-Wl,". $flags . " "; | |||
| } | |||
| } | |||
| if ($flags =~ /^\-rpath\@/) { | |||
| $flags =~ s/\@/\,/g; | |||
| @@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/types.h> | |||
| #include <sys/sysctl.h> | |||
| #endif | |||
| #ifdef linux | |||
| #if defined(linux) || defined(__sun__) | |||
| #include <sys/sysinfo.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER5" | |||
| #endif | |||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) | |||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER6" | |||
| @@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER6" | |||
| #endif | |||
| #if defined(FORCE_POWER8) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER8" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER8 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power8" | |||
| #define CORENAME "POWER8" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| @@ -819,10 +833,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " | |||
| #define LIBNAME "armv8" | |||
| #define CORENAME "XGENE1" | |||
| #else | |||
| #define CORENAME "ARMV8" | |||
| #endif | |||
| #ifdef FORCE_CORTEXA57 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV8" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA57 " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "cortexa57" | |||
| #define CORENAME "CORTEXA57" | |||
| #else | |||
| #endif | |||
| #ifndef FORCE | |||
| @@ -892,7 +920,7 @@ static int get_num_cores(void) { | |||
| size_t len; | |||
| #endif | |||
| #ifdef linux | |||
| #if defined(linux) || defined(__sun__) | |||
| //returns the number of processors which are currently online | |||
| return sysconf(_SC_NPROCESSORS_ONLN); | |||
| @@ -984,7 +1012,9 @@ int main(int argc, char *argv[]){ | |||
| #endif | |||
| #endif | |||
| #if NO_PARALLEL_MAKE==1 | |||
| #ifdef MAKE_NB_JOBS | |||
| printf("MAKE += -j %d\n", MAKE_NB_JOBS); | |||
| #elif NO_PARALLEL_MAKE==1 | |||
| printf("MAKE += -j 1\n"); | |||
| #else | |||
| #ifndef OS_WINDOWS | |||
| @@ -79,11 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, | |||
| FLOAT alpha = *ALPHA; | |||
| FLOAT beta = *BETA; | |||
| FLOAT *buffer; | |||
| int buffer_size; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #endif | |||
| int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { | |||
| @@ -134,13 +132,10 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FLOAT *buffer; | |||
| blasint lenx, leny; | |||
| int trans; | |||
| int trans, buffer_size; | |||
| blasint info, t; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #endif | |||
| int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { | |||
| @@ -215,43 +210,20 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incx < 0) x -= (lenx - 1) * incx; | |||
| if (incy < 0) y -= (leny - 1) * incy; | |||
| #ifdef MAX_STACK_ALLOC | |||
| // make it volatile because some gemv implementation (ex: dgemv_n.S) | |||
| // do not restore all register | |||
| volatile int stack_alloc_size = 0; | |||
| //for gemv_n and gemv_t, try to allocate on stack | |||
| stack_alloc_size = m + n; | |||
| #ifdef ALIGNED_ACCESS | |||
| stack_alloc_size += 3; | |||
| #endif | |||
| if(stack_alloc_size < 128) | |||
| //dgemv_n.S require a 128 bytes buffer | |||
| stack_alloc_size = 128; | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||
| stack_alloc_size = 0; | |||
| FLOAT stack_buffer[stack_alloc_size]; | |||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||
| // printf("stack_alloc_size=%d\n", stack_alloc_size); | |||
| #else | |||
| //Original OpenBLAS/GotoBLAS codes. | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| buffer_size = m + n + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT) ; | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #ifdef SMP | |||
| nthreads_max = num_cpu_avail(2); | |||
| nthreads_avail = nthreads_max; | |||
| MNK = (double) m * (double) n; | |||
| if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) | |||
| nthreads_max = 1; | |||
| if ( nthreads_max > nthreads_avail ) | |||
| nthreads = nthreads_avail; | |||
| if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = nthreads_max; | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -266,14 +238,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| #ifdef MAX_STACK_ALLOC | |||
| if(!stack_alloc_size){ | |||
| blas_memory_free(buffer); | |||
| } | |||
| #else | |||
| blas_memory_free(buffer); | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | |||
| IDEBUG_END; | |||
| @@ -171,19 +171,14 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incy < 0) y -= (n - 1) * incy; | |||
| if (incx < 0) x -= (m - 1) * incx; | |||
| #ifdef MAX_STACK_ALLOC | |||
| volatile int stack_alloc_size = m; | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||
| stack_alloc_size = 0; | |||
| FLOAT stack_buffer[stack_alloc_size]; | |||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||
| #else | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #endif | |||
| STACK_ALLOC(m, FLOAT, buffer); | |||
| #ifdef SMPTEST | |||
| nthreads = num_cpu_avail(2); | |||
| // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 | |||
| if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = num_cpu_avail(2); | |||
| else | |||
| nthreads = 1; | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -198,11 +193,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| #ifdef MAX_STACK_ALLOC | |||
| if(!stack_alloc_size) | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | |||
| IDEBUG_END; | |||
| @@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| s = db / r; | |||
| z = ONE; | |||
| if (ada > adb) z = s; | |||
| if ((ada < adb) && (c != ZERO)) z = ONE / c; | |||
| if ((ada <= adb) && (c != ZERO)) z = ONE / c; | |||
| *C = c; | |||
| *S = s; | |||
| @@ -77,12 +77,13 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||
| if (incy < 0) y -= (n - 1) * incy; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| nthreads = 1; | |||
| if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -91,6 +91,27 @@ | |||
| #endif | |||
| #endif | |||
| #ifdef SMP | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| #define MODE (BLAS_XDOUBLE | BLAS_REAL) | |||
| #elif defined(DOUBLE) | |||
| #define MODE (BLAS_DOUBLE | BLAS_REAL) | |||
| #else | |||
| #define MODE (BLAS_SINGLE | BLAS_REAL) | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| #define MODE (BLAS_XDOUBLE | BLAS_COMPLEX) | |||
| #elif defined(DOUBLE) | |||
| #define MODE (BLAS_DOUBLE | BLAS_COMPLEX) | |||
| #else | |||
| #define MODE (BLAS_SINGLE | BLAS_COMPLEX) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
| #ifndef GEMM3M | |||
| #ifndef HEMM | |||
| @@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO, | |||
| FLOAT *buffer; | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) | |||
| int nodes; | |||
| #endif | |||
| @@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| FLOAT *buffer; | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) | |||
| int nodes; | |||
| #endif | |||
| @@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| args.nthreads /= nodes; | |||
| gemm_thread_mn(mode, &args, NULL, NULL, | |||
| gemm_thread_mn(MODE, &args, NULL, NULL, | |||
| symm[4 | (side << 1) | uplo ], sa, sb, nodes); | |||
| } else { | |||
| @@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| #else | |||
| GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); | |||
| GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); | |||
| #endif | |||
| @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||
| @@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||
| @@ -77,11 +77,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, | |||
| blasint incy = *INCY; | |||
| FLOAT *buffer; | |||
| int buffer_size; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #endif | |||
| int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, | |||
| @@ -144,13 +142,10 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FLOAT *buffer; | |||
| blasint lenx, leny; | |||
| int trans; | |||
| int trans, buffer_size; | |||
| blasint info, t; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #endif | |||
| int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, | |||
| @@ -236,22 +231,26 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incx < 0) x -= (lenx - 1) * incx * 2; | |||
| if (incy < 0) y -= (leny - 1) * incy * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT) ; | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 | |||
| // cgemv_t.S returns NaN if there are NaN or Inf values in the buffer (see bug #746) | |||
| if(trans && stack_alloc_size) | |||
| memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size)); | |||
| #endif | |||
| #ifdef SMP | |||
| nthreads_max = num_cpu_avail(2); | |||
| nthreads_avail = nthreads_max; | |||
| MNK = (double) m * (double) n; | |||
| if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) | |||
| nthreads_max = 1; | |||
| if ( nthreads_max > nthreads_avail ) | |||
| nthreads = nthreads_avail; | |||
| if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = nthreads_max; | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -267,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
| @@ -210,10 +210,14 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| if (incx < 0) x -= (m - 1) * incx * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| STACK_ALLOC(2 * m, FLOAT, buffer); | |||
| #ifdef SMPTEST | |||
| nthreads = num_cpu_avail(2); | |||
| // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 | |||
| if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = num_cpu_avail(2); | |||
| else | |||
| nthreads = 1; | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -245,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
| @@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||
| FLOAT beta_i = BETA[1]; | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||
| @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||
| @@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||
| FLOAT alpha_r = ALPHA[0]; | |||
| FLOAT alpha_i = ALPHA[1]; | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||