| @@ -4,10 +4,10 @@ dist: precise | |||
| sudo: true | |||
| language: c | |||
| jobs: | |||
| matrix: | |||
| include: | |||
| - &test-ubuntu | |||
| stage: test | |||
| os: linux | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| @@ -57,7 +57,7 @@ jobs: | |||
| - TARGET_BOX=LINUX32 | |||
| - BTYPE="BINARY=32" | |||
| - stage: test | |||
| - os: linux | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| @@ -77,13 +77,13 @@ jobs: | |||
| # which is slower than container-based infrastructure used for jobs | |||
| # that don't require sudo. | |||
| - &test-alpine | |||
| stage: test | |||
| os: linux | |||
| dist: trusty | |||
| sudo: true | |||
| language: minimal | |||
| before_install: | |||
| - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ | |||
| && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" | |||
| - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" | |||
| - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| install: | |||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| @@ -117,10 +117,10 @@ jobs: | |||
| - <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" | |||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" | |||
| - &test-cmake | |||
| stage: test | |||
| os: linux | |||
| compiler: clang | |||
| addons: | |||
| apt: | |||
| @@ -147,6 +147,58 @@ jobs: | |||
| env: | |||
| - CMAKE=1 | |||
| - &test-macos | |||
| os: osx | |||
| osx_image: xcode8.3 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - brew update | |||
| - brew install gcc # for gfortran | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-macos | |||
| env: | |||
| - BTYPE="BINARY=32" | |||
| - &emulated-arm | |||
| dist: trusty | |||
| sudo: required | |||
| services: docker | |||
| env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc | |||
| name: "Emulated Build for ARMV6 with gcc" | |||
| before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset | |||
| script: | | |||
| echo "FROM openblas/alpine:${IMAGE_ARCH} | |||
| COPY . /tmp/openblas | |||
| RUN mkdir /tmp/openblas/build && \ | |||
| cd /tmp/openblas/build && \ | |||
| CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ | |||
| -D TARGET=${TARGET_ARCH} \ | |||
| -D BUILD_SHARED_LIBS=ON \ | |||
| -D BUILD_WITHOUT_LAPACK=ON \ | |||
| -D BUILD_WITHOUT_CBLAS=ON \ | |||
| -D CMAKE_BUILD_TYPE=Release ../ && \ | |||
| cmake --build ." > Dockerfile | |||
| docker build . | |||
| - <<: *emulated-arm | |||
| env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang | |||
| name: "Emulated Build for ARMV6 with clang" | |||
| - <<: *emulated-arm | |||
| env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc | |||
| name: "Emulated Build for ARMV8 with gcc" | |||
| - <<: *emulated-arm | |||
| env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang | |||
| name: "Emulated Build for ARMV8 with clang" | |||
| allow_failures: | |||
| - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc | |||
| - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang | |||
| - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc | |||
| - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang | |||
| # whitelist | |||
| branches: | |||
| only: | |||
| @@ -6,21 +6,30 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 0.dev) | |||
| set(OpenBLAS_PATCH_VERSION 6.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| include(GNUInstallDirs) | |||
| set(OpenBLAS_LIBNAME openblas) | |||
| include(CMakePackageConfigHelpers) | |||
| ####### | |||
| if(MSVC) | |||
| option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| endif() | |||
| option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) | |||
| option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoids conflicts with other BLAS libraries, especially when using | |||
| # 64 bit integer interfaces in OpenBLAS. | |||
| set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) | |||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| set(NO_LAPACK 1) | |||
| @@ -34,11 +43,13 @@ endif() | |||
| ####### | |||
| message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) | |||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
| if (NOT DYNAMIC_ARCH) | |||
| @@ -146,6 +157,7 @@ endif() | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| @@ -165,6 +177,7 @@ endif() | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
| @@ -204,14 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| # Optionally rename the exported symbols of the shared library. | |||
| # When SYMBOLPREFIX and/or SYMBOLSUFFIX is non-empty, a POST_BUILD step runs | |||
| # exports/gensymbol in "objcopy" mode to write objcopy.def and then applies it | |||
| # to the built library with objcopy --redefine-syms. | |||
| # The operands of STREQUAL are quoted so that an empty or undefined value still | |||
| # yields a well-formed comparison instead of a missing argument. | |||
| if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "") | |||
| # The *_IN variables below give every gensymbol argument a defined value. | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| set(ARCH_IN ${ARCH}) | |||
| endif() | |||
| if ("${CORE}" STREQUAL "generic") | |||
| set(ARCH_IN "GENERIC") | |||
| endif () | |||
| if (NOT DEFINED EXPRECISION) | |||
| set(EXPRECISION_IN 0) | |||
| else() | |||
| set(EXPRECISION_IN ${EXPRECISION}) | |||
| endif() | |||
| if (NOT DEFINED NO_CBLAS) | |||
| set(NO_CBLAS_IN 0) | |||
| else() | |||
| set(NO_CBLAS_IN ${NO_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| set(NO_LAPACK_IN 0) | |||
| else() | |||
| set(NO_LAPACK_IN ${NO_LAPACK}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACKE) | |||
| set(NO_LAPACKE_IN 0) | |||
| else() | |||
| set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
| endif() | |||
| if (NOT DEFINED NEED2UNDERSCORES) | |||
| set(NEED2UNDERSCORES_IN 0) | |||
| else() | |||
| set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
| endif() | |||
| if (NOT DEFINED ONLY_CBLAS) | |||
| set(ONLY_CBLAS_IN 0) | |||
| else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| # BU defaults to a single underscore when not set elsewhere. | |||
| if (NOT DEFINED BU) | |||
| set(BU _) | |||
| endif() | |||
| if (NOT "${SYMBOLPREFIX}" STREQUAL "") | |||
| message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT "${SYMBOLSUFFIX}" STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| # NOTE(review): ARCH_IN is computed above, but the command passes "${ARCH}"; | |||
| # confirm which value gensymbol is meant to receive before changing it. | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| endif() | |||
| # Install project | |||
| # Install libraries | |||
| install(TARGETS ${OpenBLAS_LIBNAME} | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| # Install headers | |||
| set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) | |||
| message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) | |||
| @@ -259,11 +342,31 @@ if(NOT NO_LAPACKE) | |||
| ADD_CUSTOM_TARGET(genlapacke | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
| ) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| endif() | |||
| include(FindPkgConfig QUIET) | |||
| if(PKG_CONFIG_FOUND) | |||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| endif() | |||
| # Generate and install the CMake package files (config, version, exported targets). | |||
| # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". | |||
| # PN is the package name used for the file names and the export namespace below. | |||
| set(PN OpenBLAS) | |||
| # All package files are installed under share/cmake/<PN><SUFFIX64>. | |||
| set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") | |||
| # Expand cmake/<PN>Config.cmake.in into <PN><SUFFIX64>Config.cmake in the build tree. | |||
| configure_package_config_file(cmake/${PN}Config.cmake.in | |||
| "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" | |||
| INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| # The version file is written without the suffix; it is renamed on install below. | |||
| # ${${PN}_VERSION} expands to the value of OpenBLAS_VERSION. | |||
| write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| VERSION ${${PN}_VERSION} | |||
| COMPATIBILITY AnyNewerVersion) | |||
| install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| # Install the version file under the suffixed name so it matches the config file. | |||
| install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| RENAME ${PN}${SUFFIX64}ConfigVersion.cmake | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| # Install the export set "<PN><SUFFIX64>Targets" with the namespace <PN><SUFFIX64>::. | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -1,4 +1,247 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.5 | |||
| 31-Dec-2018 | |||
| common: | |||
| * loop unrolling in TRMV has been enabled again. | |||
| * A domain error in the thread workload distribution for SYRK | |||
| has been fixed. | |||
| * gmake builds will now automatically add -fPIC to the build | |||
| options if the platform requires it. | |||
| * a pthreads key leakage (and associated crash on dlclose) in | |||
| the USE_TLS codepath was fixed. | |||
| * building of the utest cases on systems that do not provide | |||
| an implementation of complex.h was fixed. | |||
| x86_64: | |||
| * the SkylakeX code was changed to compile on OSX. | |||
| * unwanted application of the -march=skylake-avx512 option | |||
| to the common code parts of a DYNAMIC_ARCH build was fixed. | |||
| * improved performance of SGEMM for small workloads on Skylake X. | |||
| * performance of SGEMM and DGEMM was improved on Haswell. | |||
| ARMV8: | |||
| * a configuration error that broke the CNRM2 kernel was corrected. | |||
| * compilation of the GEMM kernels with CMAKE was fixed. | |||
| * DYNAMIC_ARCH builds are now available with CMAKE as well. | |||
| * using CMAKE for cross-compilation to the new cpu TARGETs | |||
| introduced in 0.3.4 now works. | |||
| POWER: | |||
| * a problem in cpu autodetection for AIX has been corrected. | |||
| ==================================================================== | |||
| Version 0.3.4 | |||
| 02-Dec-2018 | |||
| common: | |||
| * the new, experimental thread-local memory allocation had | |||
| inadvertently been left enabled for gmake builds in 0.3.3 | |||
| despite the announcement. It is now disabled by default, and | |||
| single-threaded builds will keep using the old allocator even | |||
| if the USE_TLS option is turned on. | |||
| * OpenBLAS will now provide enough buffer space for at least 50 | |||
| threads by default. | |||
| * The output of openblas_get_config() now contains the version | |||
| number. | |||
| * A serious thread safety bug in GEMV operation with small M and | |||
| large N size has been fixed. | |||
| * The code will now automatically call blas_thread_init after a | |||
| fork if needed before handling a call to openblas_set_num_threads | |||
| * Accesses to parallelized level3 functions from multiple callers | |||
| are now serialized to avoid thread races (unless using OpenMP). | |||
| This should provide better performance than the known-threadsafe | |||
| (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. | |||
| * When building LAPACK with gfortran, -frecursive is now (again) | |||
| enabled by default to ensure correct behaviour. | |||
| * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and | |||
| CBLAS_LAYOUT as the name of the matrix row/column order option. | |||
| * Externally set LDFLAGS are now passed through to the final compile/link | |||
| steps to facilitate setting platform-specific linker flags. | |||
| * A potential race condition during the build of LAPACK (that would | |||
| usually manifest itself as a failure to build TESTING/MATGEN) has been | |||
| fixed. | |||
| * xHEMV has been changed to stay single-threaded for small input sizes | |||
| where the overhead of multithreading exceeds any possible gains | |||
| * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or | |||
| ThunderX hardware with sizable input. | |||
| * Linker flags for the PGI compiler have been updated | |||
| * Behaviour of AXPY with zero increments is now handled in the C interface, | |||
| correcting the result on at least Intel Atom. | |||
| * The result matrix from calling SGELSS with an all-zero input matrix is | |||
| now zeroed completely. | |||
| x86_64: | |||
| * Autodetection of AMD Ryzen2 has been fixed (again). | |||
| * CMAKE builds now support labeling of an INTERFACE64=1 build of | |||
| the library with the _64 suffix. | |||
| * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel | |||
| has been sped up by rewriting with C intrinsics | |||
| * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) | |||
| POWER: | |||
| * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). | |||
| * CPU type detection has been implemented for AIX. | |||
| * CPU type detection has been fixed for NETBSD. | |||
| MIPS64: | |||
| * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. | |||
| * DSDOT on LOONGSON3A has been fixed. | |||
| * the SGEMM microkernel has been hardened against potential data loss. | |||
| ARMV8: | |||
| * DYNAMIC_ARCH support is now available for 64bit ARM | |||
| * cross-compiling for ARMV8 under iOS now works. | |||
| * cpu-specific code has been rearranged to make better use of both | |||
| hardware commonalities and model-specific compiler optimizations. | |||
| * XGENE1 has been removed as a TARGET, superseded by the improved generic | |||
| ARMV8 support. | |||
| ARMV7: | |||
| * Older assembly mnemonics have been converted to UAL form to allow | |||
| building with clang 7.0 | |||
| * Cross compiling LAPACKE for Android has been fixed again (broken by | |||
| update to LAPACK 3.7.0 some while ago). | |||
| ==================================================================== | |||
| Version 0.3.3 | |||
| 31-Aug-2018 | |||
| common: | |||
| * thread memory allocation has been switched back to the method | |||
| used before version 0.3.1 due to unexpected problems caused by | |||
| the new code under some circumstances. A new compile-time option | |||
| USE_TLS has been added to enable the new code, and it is hoped | |||
| that this can become the default again in the next version. | |||
| * LAPACK PR272 has been integrated, which fixes spurious errors | |||
| in DSYEVR and related functions caused by missing conversion | |||
| from ILAENV to ILAENV_2STAGE in several _2stage routines. | |||
| * the cmake-generated OpenBLASConfig.cmake now uses correct case | |||
| for the name of the library | |||
| * added support for Haiku OS | |||
| x86_64: | |||
| * added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY, | |||
| DSCAL, DGEMVN and DSYMVL | |||
| * added a workaround for a cygwin issue that prevented compilation | |||
| of AVX512 code | |||
| IBM Z: | |||
| * added autodetection of Z14 | |||
| * fixed TRMM errors in the generic target | |||
| ==================================================================== | |||
| Version 0.3.2 | |||
| 30-Jul-2018 | |||
| common: | |||
| * fixes for regressions caused by the rewrite of the thread | |||
| initialization code in 0.3.1 | |||
| POWER: | |||
| * fixed cpu autodetection for the BSDs | |||
| MIPS64: | |||
| * fixed utest errors in AXPY, DSDOT, ROT and SWAP | |||
| x86_64: | |||
| * added autodetection of AMD Ryzen 2 | |||
| * fixed build with older versions of MSVC | |||
| ==================================================================== | |||
| Version 0.3.1 | |||
| 01-Jul-2018 | |||
| common: | |||
| * rewritten thread initialization code with significantly reduced overhead | |||
| * added CBLAS interfaces to the IxAMIN BLAS extension functions | |||
| * fixed the lapack-test target | |||
| * CMAKE builds now create an OpenBLASConfig.cmake file | |||
| * ZAXPY now uses a single thread for small input sizes | |||
| * the LAPACK code was updated from Reference-LAPACK/lapack#253 | |||
| (fixing LAPACKE interfaces to Aasen's functions) | |||
| POWER: | |||
| * corrected CROT and ZROT behaviour with zero INC_X | |||
| ARMV7: | |||
| * corrected xDOT behaviour with zero INC_X or INC_Y | |||
| x86_64: | |||
| * retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER, | |||
| this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO | |||
| (which will still be supported via the slower PRESCOTT kernels when this option is not set) | |||
| * added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to | |||
| specify the list of x86_64 targets to include. Any target not on the list will be supported | |||
| by the Sandybridge or Nehalem kernels if available, or by Prescott. | |||
| * improved SWITCH_RATIO on Haswell for increased GEMM throughput | |||
| * added initial support for Intel Skylake X, including an AVX512 SGEMM kernel | |||
| * added autodetection of Intel Cannon Lake series as Skylake X | |||
| * added a default L2 cache size for hypervisors that return zero here (Chromebook) | |||
| * fixed a name clash with recent Windows10 headers that broke the build with (at least) | |||
| recent mingw from MSYS2 | |||
| * fixed a link error in mixed clang/gfortran builds with OpenMP | |||
| * updated the OSX deployment target to 10.8 | |||
| * switched on parallel make for builds on MS Windows by default | |||
| x86: | |||
| * fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y | |||
| ==================================================================== | |||
| Version 0.3.0 | |||
| 23-May-2018 | |||
| common: | |||
| * fixed some more thread race and locking bugs | |||
| * added preliminary support for calling an OpenMP build of the library from multiple threads | |||
| * removed performance impact of thread locks added in 0.2.20 on OpenMP code | |||
| * general code cleanup | |||
| * optimized DSDOT implementation | |||
| * improved thread distribution for GEMM | |||
| * corrected IMATCOPY/OMATCOPY implementation | |||
| * fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations | |||
| * cmake build improvements | |||
| * pkgconfig file now contains build options | |||
| * openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build | |||
| * corrections and improvements for systems with more than 64 cpus | |||
| * LAPACK code updated to 3.8.0 including later fixes | |||
| * added ReLAPACK, a recursive implementation of several LAPACK functions | |||
| * Rewrote ROTMG to handle cases that the netlib code failed to address | |||
| * Disabled (broken) multithreading code for xTRMV | |||
| * corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard | |||
| * shared memory access failures on startup are now handled more gracefully | |||
| * restored utests from earlier releases (and made them pass on all affected systems) | |||
| SPARC: | |||
| * several fixes for cpu autodetection | |||
| POWER: | |||
| * corrected vector register overwriting in several Power8 kernels | |||
| * optimized additional BLAS functions | |||
| ARM: | |||
| * added support for CortexA53 and A72 | |||
| * added autodetection for ThunderX2T99 | |||
| * made most optimized kernels the default for generic ARMv8 targets | |||
| x86_64: | |||
| * parallelized DDOT kernel for Haswell | |||
| * changed alignment directives in assembly kernels to boost performance on OSX | |||
| * fixed register handling in the GEMV microkernels (bug exposed by gcc7) | |||
| * added support for building on OpenBSD and Dragonfly | |||
| * updated compiler options to work with Intel release 2018 | |||
| * support fully optimized build with clang/flang on Microsoft Windows | |||
| * fixed building on AIX | |||
| IBM Z: | |||
| * added optimized BLAS 1/2 functions | |||
| MIPS: | |||
| * fixed cpu autodetection helper code | |||
| * added mips32 1004K cpu (Mediatek MT7621 and similar SoC) | |||
| * added mips64 I6500 cpu | |||
| ==================================================================== | |||
| Version 0.2.20 | |||
| 24-Jul-2017 | |||
| @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) | |||
| RELA = re_lapack | |||
| endif | |||
| # Translate NO_FORTRAN=1 into the NOFORTRAN and NO_LAPACK settings: | |||
| # both are defined with the value 1 and exported so that sub-makes see them. | |||
| ifeq ($(NO_FORTRAN), 1) | |||
| define NOFORTRAN | |||
| 1 | |||
| endef | |||
| define NO_LAPACK | |||
| 1 | |||
| endef | |||
| export NOFORTRAN | |||
| export NO_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| @@ -47,7 +58,7 @@ endif | |||
| endif | |||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @@ -86,16 +97,12 @@ endif | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| @@ -112,7 +119,7 @@ endif | |||
| endif | |||
| tests : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| @@ -124,7 +131,7 @@ endif | |||
| endif | |||
| libs : | |||
| ifeq ($(CORE), UNKOWN) | |||
| ifeq ($(CORE), UNKNOWN) | |||
| $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) | |||
| endif | |||
| ifeq ($(NOFORTRAN), 1) | |||
| @@ -157,6 +164,9 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | |||
| done | |||
| @echo DYNAMIC_ARCH=1 >> Makefile.conf_last | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| @echo DYNAMIC_OLDER=1 >> Makefile.conf_last | |||
| endif | |||
| endif | |||
| ifdef USE_THREAD | |||
| @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last | |||
| @@ -211,7 +221,7 @@ netlib : | |||
| else | |||
| netlib : lapack_prebuild | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| @@ -232,7 +242,7 @@ prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| lapack_prebuild : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -241,7 +251,7 @@ ifndef NOFORTRAN | |||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -257,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifdef SMP | |||
| ifeq ($(OSNAME), WINNT) | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else ifeq ($(OSNAME), Haiku) | |||
| -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| @@ -275,21 +287,21 @@ endif | |||
| endif | |||
| large.tgz : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| if [ ! -a $< ]; then | |||
| -wget http://www.netlib.org/lapack/timing/large.tgz; | |||
| fi | |||
| endif | |||
| timing.tgz : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| if [ ! -a $< ]; then | |||
| -wget http://www.netlib.org/lapack/timing/timing.tgz; | |||
| fi | |||
| endif | |||
| lapack-timing : large.tgz timing.tgz | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | |||
| @@ -298,9 +310,10 @@ endif | |||
| lapack-test : | |||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| ifneq ($(CROSS), 1) | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| endif | |||
| @@ -312,9 +325,9 @@ lapack-runtest: | |||
| blas-test: | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) | |||
| dummy : | |||
| @@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a | |||
| FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| endif | |||
| ifeq ($(CORE), CORTEXA57) | |||
| CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| endif | |||
| ifeq ($(CORE), CORTEXA72) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| ifeq ($(CORE), VULCAN) | |||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| ifeq ($(CORE), CORTEXA73) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| endif | |||
| ifeq ($(CORE), THUNDERX) | |||
| CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| CCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| FCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| endif | |||
| ifeq ($(CORE), FALKOR) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| FCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| @@ -48,6 +48,7 @@ ifndef NO_CBLAS | |||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @@ -66,18 +67,14 @@ endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @@ -98,11 +95,39 @@ ifeq ($(OSNAME), CYGWIN_NT) | |||
| endif | |||
| endif | |||
| else | |||
| #install on AIX has different options syntax | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| endif | |||
| #Generating openblas.pc | |||
| @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @@ -115,7 +140,7 @@ endif | |||
| ifndef NO_SHARED | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
| @@ -17,6 +17,10 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| ifeq ($(TARGET), 1004K) | |||
| TARGET_FLAGS = -mips32r2 | |||
| endif | |||
| ifeq ($(TARGET), P5600) | |||
| TARGET_FLAGS = -mips32r5 | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.0.dev | |||
| VERSION = 0.3.6.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -17,6 +17,11 @@ VERSION = 0.3.0.dev | |||
| # If you want to support multiple architecture in one binary | |||
| # DYNAMIC_ARCH = 1 | |||
| # If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH | |||
| # mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON, | |||
| # OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures) | |||
| # DYNAMIC_OLDER = 1 | |||
| # C compiler including binary type(32bit / 64bit). Default is gcc. | |||
| # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. | |||
| # CC = gcc | |||
| @@ -55,11 +60,26 @@ VERSION = 0.3.0.dev | |||
| # This flag is always set for POWER8. Don't modify the flag | |||
| # USE_OPENMP = 1 | |||
| # The OpenMP scheduler to use - by default this is "static" and you | |||
| # will normally not want to change this unless you know that your main | |||
| # workload will involve tasks that have highly unbalanced running times | |||
| # for individual threads. Changing away from "static" may also adversely | |||
| # affect memory access locality in NUMA systems. Setting to "runtime" will | |||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
| # You can define maximum number of threads. Basically it should be | |||
| # less than actual number of cores. If you don't specify one, it's | |||
| # automatically detected by the script. | |||
| # NUM_THREADS = 24 | |||
| # If you have enabled USE_OPENMP and your application would call | |||
| # OpenBLAS's calculation API from multi threads, please comment it in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can | |||
| # actually run in parallel. If more threads call OpenBLAS's calculation API, | |||
| # they need to wait for the preceding API calls to finish or risk data corruption. | |||
| # NUM_PARALLEL = 2 | |||
| # if you don't need to install the static library, please comment it in. | |||
| # NO_STATIC = 1 | |||
| @@ -89,6 +109,12 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # If you want to use the new, still somewhat experimental code that uses | |||
| # thread-local storage instead of a central memory buffer in memory.c | |||
| # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 | |||
| # for this to work. | |||
| # USE_TLS = 1 | |||
| # If you want to drive whole 64bit region by BLAS. Not all Fortran | |||
| # compilers support this. It's safe to keep it commented out if you | |||
| # are not sure(equivalent to "-i8" option). | |||
| @@ -100,7 +126,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| NO_WARMUP = 1 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| #NO_AFFINITY = 1 | |||
| NO_AFFINITY = 1 | |||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # BIGNUMA = 1 | |||
| @@ -126,6 +152,9 @@ NO_WARMUP = 1 | |||
| # FUNCTION_PROFILE = 1 | |||
| # Support for IEEE quad precision(it's *real* REAL*16)( under testing) | |||
| # This option should not be used - it is a holdover from unfinished code present | |||
| # in the original GotoBLAS2 library that may be usable as a starting point but | |||
| # is not even expected to compile in its present form. | |||
| # QUAD_PRECISION = 1 | |||
| # Threads are still working for a while after finishing BLAS operation | |||
| @@ -144,8 +173,11 @@ NO_WARMUP = 1 | |||
| # CONSISTENT_FPCSR = 1 | |||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | |||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | |||
| # in small matrix sizes. The default value is 4. | |||
| # with single thread. (Actually in recent versions this is a factor proportional to the | |||
| # number of floating point operations necessary for the given problem size, no longer | |||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | |||
| # threading in small matrix sizes. The default value is 4, but values as high as 50 have | |||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # If you need a sanity check by comparing with reference BLAS. It'll be very | |||
| @@ -160,8 +192,8 @@ NO_WARMUP = 1 | |||
| # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT | |||
| # COMMON_OPT = -O2 | |||
| # gfortran option for LAPACK | |||
| # enable this flag only on 64bit Linux and if you need a thread safe lapack library | |||
| # gfortran option for LAPACK to improve thread-safety | |||
| # It is enabled by default in Makefile.system for gfortran | |||
| # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT | |||
| # FCOMMON_OPT = -frecursive | |||
| @@ -9,6 +9,17 @@ ifndef TOPDIR | |||
| TOPDIR = . | |||
| endif | |||
| # Catch conflicting usage of ARCH in some BSD environments | |||
| ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| else ifeq ($(ARCH), powerpc64) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), i386) | |||
| override ARCH=x86 | |||
| else ifeq ($(ARCH), aarch64) | |||
| override ARCH=arm64 | |||
| endif | |||
| NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | |||
| # Default C compiler | |||
| @@ -17,15 +28,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | |||
| # http://stackoverflow.com/questions/4029274/mingw-and-make-variables | |||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | |||
| ifeq ($(origin CC),default) | |||
| # Check if $(CC) refers to a valid command and set the value to gcc if not | |||
| ifneq ($(findstring cmd.exe,$(SHELL)),) | |||
| ifeq ($(shell where $(CC) 2>NUL),) | |||
| CC = gcc | |||
| # Change the default compile to clang on Mac OSX. | |||
| # http://stackoverflow.com/questions/714100/os-detecting-makefile | |||
| UNAME_S := $(shell uname -s) | |||
| ifeq ($(UNAME_S),Darwin) | |||
| CC = clang | |||
| # EXTRALIB += -Wl,-no_compact_unwind | |||
| endif | |||
| endif | |||
| else # POSIX-ish | |||
| ifeq ($(shell command -v $(CC) 2>/dev/null),) | |||
| ifeq ($(shell uname -s),Darwin) | |||
| CC = clang | |||
| # EXTRALIB += -Wl,-no_compact_unwind | |||
| else | |||
| CC = gcc | |||
| endif # Darwin | |||
| endif # CC exists | |||
| endif # Shell is sane | |||
| endif # CC is set to default | |||
| # Default Fortran compiler (FC) is selected by f_check. | |||
| @@ -45,6 +65,7 @@ endif | |||
| ifdef TARGET | |||
| GETARCH_FLAGS := -DFORCE_$(TARGET) | |||
| GETARCH_FLAGS += -DUSER_TARGET | |||
| endif | |||
| # Force fallbacks for 32bit | |||
| @@ -53,6 +74,9 @@ ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -86,6 +110,9 @@ ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET_CORE), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -132,6 +159,10 @@ ifeq ($(NO_AVX2), 1) | |||
| GETARCH_FLAGS += -DNO_AVX2 | |||
| endif | |||
| ifeq ($(NO_AVX512), 1) | |||
| GETARCH_FLAGS += -DNO_AVX512 | |||
| endif | |||
| ifeq ($(DEBUG), 1) | |||
| GETARCH_FLAGS += -g | |||
| endif | |||
| @@ -175,6 +206,10 @@ endif | |||
| endif | |||
| ifndef NUM_PARALLEL | |||
| NUM_PARALLEL = 1 | |||
| endif | |||
| ifndef NUM_THREADS | |||
| NUM_THREADS = $(NUM_CORES) | |||
| endif | |||
| @@ -225,12 +260,12 @@ endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| ifndef MACOSX_DEPLOYMENT_TARGET | |||
| export MACOSX_DEPLOYMENT_TARGET=10.6 | |||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
| endif | |||
| MD5SUM = md5 -r | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | |||
| MD5SUM = md5 -r | |||
| endif | |||
| @@ -304,6 +339,7 @@ endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| NEED_PIC = 0 | |||
| NO_EXPRECISION = 1 | |||
| OS_CYGWIN_NT = 1 | |||
| endif | |||
| ifneq ($(OSNAME), WINNT) | |||
| @@ -423,7 +459,7 @@ CCOMMON_OPT += -fopenmp | |||
| endif | |||
| ifeq ($(C_COMPILER), INTEL) | |||
| CCOMMON_OPT += -openmp | |||
| CCOMMON_OPT += -fopenmp | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| @@ -448,13 +484,44 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| DYNAMIC_CORE = PRESCOTT CORE2 | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += PENRYN DUNNINGTON | |||
| endif | |||
| DYNAMIC_CORE += NEHALEM | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += OPTERON OPTERON_SSE3 | |||
| endif | |||
| DYNAMIC_CORE += BARCELONA | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| DYNAMIC_CORE += BOBCAT ATOM NANO | |||
| endif | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR | |||
| endif | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += HASWELL ZEN | |||
| endif | |||
| ifneq ($(NO_AVX512), 1) | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += SKYLAKEX | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef DYNAMIC_LIST | |||
| override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) | |||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT | |||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||
| CCOMMON_OPT += $(XCCOMMON_OPT) | |||
| #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| DYNAMIC_CORE = ARMV8 | |||
| DYNAMIC_CORE += CORTEXA57 | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| @@ -554,9 +621,14 @@ CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), 1004K) | |||
| CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), P5600) | |||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), I6400) | |||
| @@ -660,6 +732,8 @@ endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| CCOMMON_OPT += -DF_INTERFACE_GFORT | |||
| FCOMMON_OPT += -Wall | |||
| # make single-threaded LAPACK calls thread-safe #1847 | |||
| FCOMMON_OPT += -frecursive | |||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| ifneq ($(NO_LAPACK), 1) | |||
| EXTRALIB += -lgfortran | |||
| @@ -703,7 +777,7 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -openmp | |||
| FCOMMON_OPT += -fopenmp | |||
| endif | |||
| endif | |||
| @@ -883,6 +957,10 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| CCOMMON_OPT += -DDYNAMIC_ARCH | |||
| endif | |||
| ifeq ($(DYNAMIC_OLDER), 1) | |||
| CCOMMON_OPT += -DDYNAMIC_OLDER | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| CCOMMON_OPT += -DNO_LAPACK | |||
| #Disable LAPACK C interface | |||
| @@ -905,6 +983,10 @@ ifeq ($(NO_AVX2), 1) | |||
| CCOMMON_OPT += -DNO_AVX2 | |||
| endif | |||
| ifeq ($(NO_AVX512), 1) | |||
| CCOMMON_OPT += -DNO_AVX512 | |||
| endif | |||
| ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| @@ -951,10 +1033,18 @@ endif | |||
| CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) | |||
| CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) | |||
| ifdef USE_SIMPLE_THREADED_LEVEL3 | |||
| CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | |||
| endif | |||
| ifdef USE_TLS | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| CCOMMON_OPT += -DVERSION=\"$(VERSION)\" | |||
| ifndef SYMBOLPREFIX | |||
| SYMBOLPREFIX = | |||
| endif | |||
| @@ -1065,8 +1155,6 @@ ifndef FCOMMON_OPT | |||
| FCOMMON_OPT = -O2 -frecursive | |||
| endif | |||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| @@ -1074,6 +1162,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| ifdef NEED_PIC | |||
| ifeq (,$(findstring PIC,$(FFLAGS))) | |||
| override FFLAGS += -fPIC | |||
| endif | |||
| endif | |||
| #For LAPACK Fortran codes. | |||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | |||
| ifdef OS_WINDOWS | |||
| @@ -1132,7 +1226,11 @@ endif | |||
| LIBDLLNAME = $(LIBPREFIX).dll | |||
| IMPLIBNAME = lib$(LIBNAMEBASE).dll.a | |||
| ifneq ($(OSNAME), AIX) | |||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | |||
| else | |||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) | |||
| endif | |||
| LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | |||
| LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | |||
| LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | |||
| @@ -1209,6 +1307,7 @@ export MSA_FLAGS | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export SGEMM_UNROLL_M | |||
| export SGEMM_UNROLL_N | |||
| @@ -8,6 +8,34 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), HASWELL) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX2 | |||
| CCOMMON_OPT += -mavx2 | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| ARFLAGS = -m x64 | |||
| endif | |||
| @@ -5,175 +5,221 @@ | |||
| Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS) | |||
| AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | |||
| ## Introduction | |||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. | |||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | |||
| Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. | |||
| ## Binary Packages | |||
| We provide binary packages for the following platform. | |||
| We provide official binary packages for the following platform: | |||
| * Windows x86/x86_64 | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). | |||
| ## Installation from Source | |||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | |||
| ### Normal compile | |||
| * type "make" to detect the CPU automatically. | |||
| or | |||
| * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||
| Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. | |||
| ### Cross compile | |||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||
| ### Dependencies | |||
| Examples: | |||
| Building OpenBLAS requires the following to be installed: | |||
| On X86 box, compile this library for loongson3a CPU. | |||
| * GNU Make | |||
| * A C compiler, e.g. GCC or Clang | |||
| * A Fortran compiler (optional, for LAPACK) | |||
| * IBM MASS (optional, see below) | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| ### Normal compile | |||
| On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. | |||
| Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. | |||
| To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. | |||
| The full target list is in the file `TargetList.txt`. | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ### Cross compile | |||
| ### Debug version | |||
| Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. | |||
| The target must be specified explicitly when cross compiling. | |||
| Examples: | |||
| make DEBUG=1 | |||
| * On an x86 box, compile this library for a loongson3a CPU: | |||
| ```sh | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| ``` | |||
| ### Compile with MASS Support on Power CPU (Optional dependency) | |||
| * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: | |||
| ```sh | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ``` | |||
| [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and | |||
| Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. | |||
| The library can be installed as below - | |||
| ### Debug version | |||
| * On Ubuntu: | |||
| A debug version can be built using `make DEBUG=1`. | |||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br> | |||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br> | |||
| sudo apt-get update</br> | |||
| sudo apt-get install libxlmass-devel.8.1.5</br> | |||
| ### Compile with MASS support on Power CPU (optional) | |||
| * On RHEL/CentOS: | |||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library | |||
| consists of a set of mathematical functions for C, C++, and Fortran applications that are | |||
| tuned for optimum performance on POWER architectures. | |||
| OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | |||
| The library can be installed as shown: | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br> | |||
| sudo rpm --import repomd.xml.key</br> | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br> | |||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br> | |||
| sudo yum install libxlmass-devel.8.1.5</br> | |||
| * On Ubuntu: | |||
| ```sh | |||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - | |||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list | |||
| sudo apt-get update | |||
| sudo apt-get install libxlmass-devel.8.1.5 | |||
| ``` | |||
| After installing MASS library, compile openblas with USE_MASS=1. | |||
| * On RHEL/CentOS: | |||
| ```sh | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key | |||
| sudo rpm --import repomd.xml.key | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo | |||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ | |||
| sudo yum install libxlmass-devel.8.1.5 | |||
| ``` | |||
| Example: | |||
| After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. | |||
| For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`. | |||
| Compiling on Power8 with MASS support - | |||
| ### Install to a specific directory (optional) | |||
| make USE_MASS=1 TARGET=POWER8 | |||
| Use `PREFIX=` when invoking `make`, for example | |||
| ### Install to the directory (optional) | |||
| ```sh | |||
| make install PREFIX=your_installation_directory | |||
| ``` | |||
| Example: | |||
| The default installation directory is `/opt/OpenBLAS`. | |||
| make install PREFIX=your_installation_directory | |||
| ## Supported CPUs and Operating Systems | |||
| The default directory is /opt/OpenBLAS | |||
| Please read `GotoBLAS_01Readme.txt`. | |||
| ## Support CPU & OS | |||
| Please read GotoBLAS_01Readme.txt | |||
| ### Additional supported CPUs | |||
| ### Additional support CPU: | |||
| #### x86/x86-64 | |||
| #### x86/x86-64: | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | |||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | |||
| - **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | |||
| #### MIPS64: | |||
| #### MIPS64 | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | |||
| - **ICT Loongson 3B**: Experimental | |||
| #### ARM: | |||
| - **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) | |||
| - **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) | |||
| #### ARM | |||
| #### ARM64: | |||
| - **ARMV8**: Experimental | |||
| - **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+) | |||
| - **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15) | |||
| #### ARM64 | |||
| - **ARMv8**: Experimental | |||
| - **ARM Cortex-A57**: Experimental | |||
| #### PPC/PPC64 | |||
| - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1 | |||
| #### IBM zEnterprise System: | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||
| - **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||
| ### Support OS: | |||
| - **GNU/Linux** | |||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | |||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | |||
| - **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| #### IBM zEnterprise System | |||
| ## Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||
| ### Set the number of threads with environment variables. | |||
| ### Supported OS | |||
| Examples: | |||
| - **GNU/Linux** | |||
| - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. | |||
| - **FreeBSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **OpenBSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| ## Usage | |||
| or | |||
| Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was | |||
| compiled as a shared library. | |||
| export GOTO_NUM_THREADS=4 | |||
| ### Setting the number of threads using environment variables | |||
| or | |||
| Environment variables are used to specify a maximum number of threads. | |||
| For example, | |||
| export OMP_NUM_THREADS=4 | |||
| ```sh | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| export GOTO_NUM_THREADS=4 | |||
| export OMP_NUM_THREADS=4 | |||
| ``` | |||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||
| The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`. | |||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||
| If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS` | |||
| environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when | |||
| compiled with `USE_OPENMP=1`. | |||
| ### Set the number of threads on runtime. | |||
| ### Setting the number of threads at runtime | |||
| We provided the below functions to control the number of threads on runtime. | |||
| We provide the following functions to control the number of threads at runtime: | |||
| void goto_set_num_threads(int num_threads); | |||
| ```c | |||
| void goto_set_num_threads(int num_threads); | |||
| void openblas_set_num_threads(int num_threads); | |||
| ``` | |||
| void openblas_set_num_threads(int num_threads); | |||
| If you compile this library with `USE_OPENMP=1`, you should use the above functions too. | |||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||
| ## Reporting bugs | |||
| ## Report Bugs | |||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | |||
| Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. | |||
| ## Contact | |||
| * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users | |||
| * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev | |||
| ## ChangeLog | |||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||
| ## Change log | |||
| Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. | |||
| ## Troubleshooting | |||
| * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. | |||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. | |||
| * The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. | |||
| * OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. | |||
| * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. | |||
| * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | |||
| Clang 3.0 will generate the wrong AVX binary code. | |||
| * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. | |||
| * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), | |||
| there is experimental support for up to 1024 CPUs/cores and 128 NUMA nodes if you build | |||
| the library with `BIGNUMA=1`. | |||
| * OpenBLAS does not set processor affinity by default. | |||
| On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in | |||
| Makefile.rule. However, note that this may cause | |||
| [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`). | |||
| However, it will be okay when you run the same test case on the shell. | |||
| ## Contributing | |||
| 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. | |||
| 1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. | |||
| 1. Write a test which shows that the bug was fixed or that the feature works as expected. | |||
| 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. | |||
| 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue | |||
| to start a discussion around a feature idea or a bug. | |||
| 2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. | |||
| 3. Write a test which shows that the bug was fixed or that the feature works as expected. | |||
| 4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. | |||
| ## Donation | |||
| Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). | |||
| @@ -20,6 +20,7 @@ DUNNINGTON | |||
| NEHALEM | |||
| SANDYBRIDGE | |||
| HASWELL | |||
| SKYLAKEX | |||
| ATOM | |||
| b)AMD CPU: | |||
| @@ -56,6 +57,7 @@ CELL | |||
| 3.MIPS CPU: | |||
| P5600 | |||
| 1004K | |||
| 4.MIPS64 CPU: | |||
| SICORTEX | |||
| @@ -81,8 +83,11 @@ ARMV5 | |||
| 8.ARM 64-bit CPU: | |||
| ARMV8 | |||
| CORTEXA53 | |||
| CORTEXA57 | |||
| VULCAN | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| FALKOR | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| @@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make | |||
| NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set | |||
| `MAX_CPU_NUMBER=NUM_THREADS`. | |||
| Despite its name, and due to the use of memory buffers in functions like SGEMM, | |||
| the setting of NUM_THREADS can be relevant even for a single-threaded build | |||
| of OpenBLAS, if such functions get called by multiple threads of a program | |||
| that uses OpenBLAS. In some cases, the affected code may simply crash or throw | |||
| a segmentation fault without displaying the above warning first. | |||
| Note that the number of threads used at runtime can be altered to differ from the | |||
| value NUM_THREADS was set to at build time. At runtime, the actual number of | |||
| threads can be set anywhere from 1 to the build's NUM_THREADS (note however, | |||
| that this does not change the number of memory buffers that will be allocated, | |||
| which is set at build time). The number of threads for a process can be set by | |||
| using the mechanisms described below. | |||
| #### How can I use OpenBLAS in multi-threaded applications? | |||
| If your application is already multi-threaded, it will conflict with OpenBLAS | |||
| @@ -237,7 +237,7 @@ int main(int argc, char *argv[]){ | |||
| timeg = time1/loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1); | |||
| COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1); | |||
| } | |||
| @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 0.0}; | |||
| char trans='N'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1,inc_y=1; | |||
| @@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq ""); | |||
| $os = Linux if ($data =~ /OS_LINUX/); | |||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||
| $os = OpenBSD if ($data =~ /OS_OPENBSD/); | |||
| $os = DragonFly if ($data =~ /OS_DRAGONFLY/); | |||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||
| $os = AIX if ($data =~ /OS_AIX/); | |||
| @@ -62,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/); | |||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||
| $os = Interix if ($data =~ /OS_INTERIX/); | |||
| $os = Android if ($data =~ /OS_ANDROID/); | |||
| $os = Haiku if ($data =~ /OS_HAIKU/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| @@ -199,6 +202,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $binformat = bin32; | |||
| $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| # Probe whether the C compiler can build AVX512 code for x86/x86_64 targets. | |||
| # Result: $no_avx512 is 1 when the probe fails to compile, 0 otherwise | |||
| # (and always 0 on other architectures, where no probe is run). | |||
| $no_avx512= 0; | |||
| if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| # Minimal test program containing a single AVX512 (zmm register) instruction. | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| # "$tmpf" is interpolated as the path of the probe source; the output goes to "$tmpf.o". | |||
| $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| # Any non-zero status (compile error or failure to launch) disables AVX512. | |||
| $no_avx512 = (system(@cmd) != 0) ? 1 : 0; | |||
| # Remove the probe's output file; it is named after $tmpf, not the literal "tmpf.o". | |||
| unlink("$tmpf.o"); | |||
| } | |||
| $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| $data =~ /globl\s([_\.]*)(.*)/; | |||
| @@ -206,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/; | |||
| $need_fu = $1; | |||
| $cross = 0; | |||
| $cross = 1 if ($os ne $hostos); | |||
| if ($architecture ne $hostarch) { | |||
| $cross = 1; | |||
| @@ -214,6 +231,8 @@ if ($architecture ne $hostarch) { | |||
| $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | |||
| } | |||
| $cross = 1 if ($os ne $hostos); | |||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | |||
| $linker_L = ""; | |||
| @@ -286,6 +305,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0; | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; | |||
| print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| @@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1 | |||
| typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | |||
| typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | |||
| typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | |||
| typedef CBLAS_ORDER CBLAS_LAYOUT; | |||
| float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); | |||
| double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); | |||
| float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); | |||
| @@ -82,6 +83,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -0,0 +1,79 @@ | |||
| # OpenBLASConfig.cmake | |||
| # -------------------- | |||
| # | |||
| # OpenBLAS cmake module. | |||
| # This module sets the following variables in your project:: | |||
| # | |||
| # OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system | |||
| # OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release | |||
| # OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located. | |||
| # OpenBLAS_INCLUDE_DIR - same as DIRS | |||
| # OpenBLAS_LIBRARIES - OpenBLAS library to link against. | |||
| # OpenBLAS_LIBRARY - same as LIBRARIES | |||
| # | |||
| # | |||
| # Available components:: | |||
| # | |||
| ## shared - search for only shared library | |||
| ## static - search for only static library | |||
| # serial - search for unthreaded library | |||
| # pthread - search for native pthread threaded library | |||
| # openmp - search for OpenMP threaded library | |||
| # | |||
| # | |||
| # Exported targets:: | |||
| # | |||
| # If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED` | |||
| ## target. Target is shared _or_ static, so, for both, use separate, not | |||
| ## overlapping, installations. :: | |||
| # | |||
| # OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached. | |||
| # | |||
| # | |||
| # Suggested usage:: | |||
| # | |||
| # find_package(OpenBLAS) | |||
| # find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread) | |||
| # | |||
| # | |||
| # The following variables can be set to guide the search for this package:: | |||
| # | |||
| # OpenBLAS_DIR - CMake variable, set to directory containing this Config file | |||
| # CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package | |||
| # PATH - environment variable, set to bin directory of this package | |||
| # CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables | |||
| # find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build | |||
| @PACKAGE_INIT@ | |||
| # Package name used as the prefix for every variable and target set below. | |||
| set(PN OpenBLAS) | |||
| # need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon | |||
| # Exactly one threading component is marked as found, in this priority order: | |||
| # openmp, then pthread, then serial. The @...@ placeholders are substituted | |||
| # when this template is configured. | |||
| if(@USE_OPENMP@) | |||
| set(${PN}_openmp_FOUND 1) | |||
| elseif(@USE_THREAD@) | |||
| set(${PN}_pthread_FOUND 1) | |||
| else() | |||
| set(${PN}_serial_FOUND 1) | |||
| endif() | |||
| # Compare the <PN>_<component>_FOUND flags above against the components requested | |||
| # by the caller (macro presumably provided by @PACKAGE_INIT@ - confirm). | |||
| check_required_components(${PN}) | |||
| #----------------------------------------------------------------------------- | |||
| # Don't include targets if this file is being picked up by another | |||
| # project which has already built this as a subproject | |||
| #----------------------------------------------------------------------------- | |||
| # Note: the legacy ${PN}_* variables below are only set inside this guard, so | |||
| # they stay unset when the ${PN}::OpenBLAS target already exists. | |||
| if(NOT TARGET ${PN}::OpenBLAS) | |||
| include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") | |||
| # ${PN}_LIBRARY <- LOCATION property of the imported target. | |||
| get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) | |||
| set(${PN}_LIBRARY ${_loc}) | |||
| # ${PN}_LIBRARIES <- the target's INTERFACE_LINK_LIBRARIES property. | |||
| # NOTE(review): the header documents this as "OpenBLAS library to link against" | |||
| # and ${PN}_LIBRARY as "same as LIBRARIES", but the two are read from different | |||
| # properties here - confirm this is intended. | |||
| get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) | |||
| set(${PN}_LIBRARIES ${_ill}) | |||
| # ${PN}_INCLUDE_DIR <- the target's INCLUDE_DIRECTORIES property. | |||
| get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) | |||
| set(${PN}_INCLUDE_DIR ${_id}) | |||
| # ${PN}_INCLUDE_DIRS <- the target's INTERFACE_INCLUDE_DIRECTORIES property. | |||
| get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) | |||
| set(${PN}_INCLUDE_DIRS ${_iid}) | |||
| endif() | |||
| @@ -44,18 +44,36 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) | |||
| endif () | |||
| if (X86) | |||
| set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | |||
| endif () | |||
| if (X86_64) | |||
| set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | |||
| set(DYNAMIC_CORE PRESCOTT CORE2) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON) | |||
| endif () | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3) | |||
| endif () | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) | |||
| if (DYNAMIC_OLDER) | |||
| set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO) | |||
| endif () | |||
| if (NOT NO_AVX) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) | |||
| endif () | |||
| if (NOT NO_AVX2) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | |||
| endif () | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
| endif () | |||
| endif () | |||
| if (NOT DYNAMIC_CORE) | |||
| @@ -3,6 +3,11 @@ | |||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | |||
| ## Sets Fortran related variables. | |||
| # When INTERFACE64 is enabled, define the "64" / "_64" suffix variables | |||
| # (left undefined otherwise). | |||
| if (INTERFACE64) | |||
| set(SUFFIX64 64) | |||
| set(SUFFIX64_UNDERSCORE _64) | |||
| endif() | |||
| if (${F_COMPILER} STREQUAL "FLANG") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| @@ -39,7 +44,7 @@ endif () | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| if (NOT NO_LAPACK) | |||
| # Append -lgfortran to the existing EXTRALIB value ("${EXTRALIB}", not the literal text "{EXTRALIB}"). | |||
| set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
| @@ -1,9 +1,11 @@ | |||
| libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||
| libsuffix=@SUFFIX64_UNDERSCORE@ | |||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||
| openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OPENBLAS_VERSION@ | |||
| URL: https://github.com/xianyi/OpenBLAS | |||
| Libs: -L${libdir} -lopenblas | |||
| Libs: -L${libdir} -lopenblas${libsuffix} | |||
| Cflags: -I${includedir} | |||
| @@ -85,15 +85,20 @@ if (NOT NOFORTRAN) | |||
| endif () | |||
| # Cannot run getarch on target if we are cross-compiling | |||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | |||
| # Write to config as getarch would | |||
| if (DEFINED TARGET_CORE) | |||
| set(TCORE ${TARGET_CORE}) | |||
| else() | |||
| set(TCORE ${CORE}) | |||
| endif() | |||
| # TODO: Set up defines that getarch sets up based on every other target | |||
| # Perhaps this should be inside a different file as it grows larger | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ${CORE}\n" | |||
| "#define CHAR_CORENAME \"${CORE}\"\n") | |||
| if ("${CORE}" STREQUAL "ARMV7") | |||
| "#define ${TCORE}\n" | |||
| "#define CHAR_CORENAME \"${TCORE}\"\n") | |||
| if ("${TCORE}" STREQUAL "ARMV7") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t32\n" | |||
| @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "ARMV8") | |||
| elseif ("${TCORE}" STREQUAL "ARMV8") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| @@ -116,10 +121,45 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define L2_ASSOCIATIVE\t32\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| "#define L2_ASSOCIATIVE\t32\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "CORTEXA57") | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t3\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t49152\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -127,7 +167,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t2097152\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| @@ -135,15 +175,97 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n") | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "FALKOR") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t3\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t128\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX") | |||
| # Cache/TLB parameters and feature defines written to the target config for THUNDERX. | |||
| # L2_SIZE is 16 MiB (16 * 1024 * 1024 = 16777216 bytes). | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t3\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t128\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t16777216\n" | |||
| "#define L2_LINESIZE\t128\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| # GEMM unroll factors and SYMV_P selected for this core. | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX2T99") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t8\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t8\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define L3_SIZE\t33554432\n" | |||
| "#define L3_LINESIZE\t64\n" | |||
| "#define L3_ASSOCIATIVE\t32\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| endif() | |||
| # Or should this actually be NUM_CORES? | |||
| @@ -163,6 +285,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") | |||
| # Move to where gen_config_h would place it | |||
| file(MAKE_DIRECTORY ${TARGET_CONF_DIR}) | |||
| file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") | |||
| else(NOT CMAKE_CROSSCOMPILING) | |||
| @@ -33,7 +33,7 @@ endif () | |||
| if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| set(NO_AVX 1) | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") | |||
| set(TARGET "NEHALEM") | |||
| endif () | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| @@ -41,6 +41,22 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| endif () | |||
| endif () | |||
| # Add architecture-specific compiler flags to KERNEL_DEFINITIONS for the selected TARGET: | |||
| #   SKYLAKEX -> -march=skylake-avx512 (unless NO_AVX512) | |||
| #   HASWELL  -> -mavx2 (unless NO_AVX2), for GCC >= 4.7 and for Clang | |||
| if (DEFINED TARGET) | |||
| if ("${TARGET}" STREQUAL "SKYLAKEX" AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| if ("${TARGET}" STREQUAL "HASWELL" AND NOT NO_AVX2) | |||
| if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") | |||
| # Strip the trailing newline so the version comparison sees a clean version string. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| if (NOT "${GCC_VERSION}" VERSION_LESS 4.7) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| # CMake reports the compiler ID as "Clang" or "AppleClang"; STREQUAL is | |||
| # case-sensitive, so comparing against "CLANG" never matched. | |||
| elseif ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (DEFINED TARGET) | |||
| message(STATUS "Targeting the ${TARGET} architecture.") | |||
| set(GETARCH_FLAGS "-DFORCE_${TARGET}") | |||
| @@ -96,8 +112,12 @@ if (NOT CMAKE_CROSSCOMPILING) | |||
| endif() | |||
| if (NOT DEFINED NUM_PARALLEL) | |||
| set(NUM_PARALLEL 1) | |||
| endif() | |||
| if (NOT DEFINED NUM_THREADS) | |||
| if (NOT NUM_CORES EQUAL 0) | |||
| if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) | |||
| # HT? | |||
| set(NUM_THREADS ${NUM_CORES}) | |||
| else () | |||
| @@ -159,6 +179,9 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| endif () | |||
| endif () | |||
| if (NO_LAPACK) | |||
| @@ -207,6 +230,10 @@ if (CONSISTENT_FPCSR) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") | |||
| endif () | |||
| if (USE_TLS) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS") | |||
| endif () | |||
| # Only for development | |||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") | |||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") | |||
| @@ -224,6 +251,8 @@ endif () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") | |||
| if (USE_SIMPLE_THREADED_LEVEL3) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") | |||
| endif () | |||
| @@ -291,6 +320,8 @@ if (MIXED_MEMORY_ALLOCATION) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") | |||
| endif () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") | |||
| set(REVISION "-r${OpenBLAS_VERSION}") | |||
| set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) | |||
| @@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") | |||
| set(HOST_OS WINNT) | |||
| endif () | |||
| if ("${HOST_OS}" STREQUAL "LINUX") | |||
| # check if we're building natively on Android (TERMUX) | |||
| # `uname -o` prints the operating system name; the trailing newline is stripped by | |||
| # CMake itself instead of piping through `tr` (execute_process runs no shell, so the | |||
| # quotes in '\n' were passed to tr literally). | |||
| execute_process(COMMAND uname -o OUTPUT_VARIABLE OPERATING_SYSTEM OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # Quoted so that an empty result (uname failed) does not produce a malformed if(). | |||
| if ("${OPERATING_SYSTEM}" MATCHES "Android") | |||
| set(HOST_OS ANDROID) | |||
| endif() | |||
| endif() | |||
| if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | |||
| OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | |||
| @@ -66,3 +76,12 @@ else() | |||
| set(BINARY32 1) | |||
| endif() | |||
| # Probe whether the C compiler can build AVX512 code. On failure NO_AVX512 is set to 1 | |||
| # and -DNO_AVX512 is appended to CCOMMON_OPT; on success NO_AVX512 is 0. | |||
| if (X86_64 OR X86) | |||
| # Minimal test program containing a single AVX512 (zmm register) instruction. | |||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| # Treat every result other than exit code 0 as "no AVX512" (compilers may fail with | |||
| # codes other than 1, and a launch failure yields an error string), and normalize the | |||
| # flag to 1 so that NO_AVX512 and -DNO_AVX512 always agree. | |||
| if (NOT "${NO_AVX512}" STREQUAL "0") | |||
| set (NO_AVX512 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| # Remove the probe files from where they were written; relative paths passed to | |||
| # file(REMOVE) would not refer to PROJECT_BINARY_DIR. | |||
| file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o") | |||
| endif() | |||
| @@ -93,7 +93,7 @@ extern "C" { | |||
| #include <sched.h> | |||
| #endif | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) | |||
| #include <sched.h> | |||
| #endif | |||
| @@ -105,6 +105,10 @@ extern "C" { | |||
| #endif | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| #define NO_SYSV_IPC | |||
| #endif | |||
| #ifdef OS_WINDOWS | |||
| #ifdef ATOM | |||
| #define GOTO_ATOM ATOM | |||
| @@ -179,7 +183,7 @@ extern "C" { | |||
| #define ALLOCA_ALIGN 63UL | |||
| #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) | |||
| #define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) | |||
| #ifdef NEEDBUNDERSCORE | |||
| #define BLASFUNC(FUNC) FUNC##_ | |||
| @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG; | |||
| #ifdef USE64BITINT | |||
| typedef BLASLONG blasint; | |||
| #if defined(OS_WINDOWS) && defined(__64BIT__) | |||
| #define blasabs(x) llabs(x) | |||
| #else | |||
| #define blasabs(x) labs(x) | |||
| #endif | |||
| #else | |||
| typedef int blasint; | |||
| #define blasabs(x) abs(x) | |||
| #endif | |||
| #else | |||
| #ifdef USE64BITINT | |||
| @@ -642,6 +652,7 @@ void gotoblas_profile_init(void); | |||
| void gotoblas_profile_quit(void); | |||
| #ifdef USE_OPENMP | |||
| #ifndef C_MSVC | |||
| int omp_in_parallel(void); | |||
| int omp_get_num_procs(void); | |||
| @@ -649,6 +660,21 @@ int omp_get_num_procs(void); | |||
| __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||
| #endif | |||
| #if (__STDC_VERSION__ >= 201112L) | |||
| #if defined(C_GCC) && ( __GNUC__ < 7) | |||
| // workaround for GCC bug 65467 | |||
| #ifndef _Atomic | |||
| #define _Atomic volatile | |||
| #endif | |||
| #endif | |||
| #include <stdatomic.h> | |||
| #else | |||
| #ifndef _Atomic | |||
| #define _Atomic volatile | |||
| #endif | |||
| #endif | |||
| #else | |||
| #ifdef __ELF__ | |||
| int omp_in_parallel (void) __attribute__ ((weak)); | |||
| @@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); | |||
| extern "C" { | |||
| #endif | |||
| extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||
| float * A, BLASLONG strideA, | |||
| float * B, BLASLONG strideB, | |||
| float * R, BLASLONG strideR); | |||
| extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||
| int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, | |||
| @@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){ | |||
| #define RPCC_DEFINED | |||
| #ifndef NO_AFFINITY | |||
| #define WHEREAMI | |||
| //#define WHEREAMI | |||
| static inline int WhereAmI(void){ | |||
| int ret=0; | |||
| __asm__ __volatile__(".set push \n" | |||
| @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * - large enough to support all architectures and kernel | |||
| * Chosing a too small SIZE will lead to a stack smashing. | |||
| */ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| /* do not restore all register */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ | |||
| stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some functions (e.g. dgemv_n.S) */ \ | |||
| /* do not restore all registers */ \ | |||
| volatile int stack_alloc_size = SIZE; \ | |||
| if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ | |||
| STACK_ALLOC_PROTECT_SET \ | |||
| /* Avoid declaring an array of length 0 */ \ | |||
| TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ | |||
| __attribute__((aligned(0x20))); \ | |||
| BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); | |||
| #else | |||
| //Original OpenBLAS/GotoBLAS codes. | |||
| @@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| result = x/y; | |||
| return result; | |||
| #else | |||
| #if (MAX_CPU_NUMBER > 64) | |||
| if ( y > 64) { | |||
| result = x/y; | |||
| return result; | |||
| } | |||
| #endif | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| @@ -327,7 +333,7 @@ REALNAME: | |||
| #endif | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 16; \ | |||
| @@ -60,8 +60,13 @@ | |||
| #endif | |||
| */ | |||
| #define MB | |||
| #define WMB | |||
| #ifdef __GNUC__ | |||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #else | |||
| #define MB do {} while (0) | |||
| #define WMB do {} while (0) | |||
| #endif | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| @@ -129,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| "=b" (*ebx), | |||
| "=c" (*ecx), | |||
| "=d" (*edx) | |||
| : "0" (op)); | |||
| : "0" (op), "c"(0)); | |||
| #endif | |||
| } | |||
| @@ -196,6 +201,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| if (y <= 1) return x; | |||
| #if (MAX_CPU_NUMBER > 64) | |||
| if (y > 64) { | |||
| result = x / y; | |||
| return result; | |||
| } | |||
| #endif | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| @@ -403,7 +415,7 @@ REALNAME: | |||
| #define EPILOGUE .end | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 512; \ | |||
| @@ -53,6 +53,7 @@ | |||
| #define VENDOR_SIS 8 | |||
| #define VENDOR_TRANSMETA 9 | |||
| #define VENDOR_NSC 10 | |||
| #define VENDOR_HYGON 11 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -115,6 +116,8 @@ | |||
| #define CORE_STEAMROLLER 25 | |||
| #define CORE_EXCAVATOR 26 | |||
| #define CORE_ZEN 27 | |||
| #define CORE_SKYLAKEX 28 | |||
| #define CORE_DHYANA 29 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -137,6 +140,8 @@ | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define HAVE_FMA3 (1 << 20) | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -211,5 +216,9 @@ typedef struct { | |||
| #define CPUTYPE_STEAMROLLER 49 | |||
| #define CPUTYPE_EXCAVATOR 50 | |||
| #define CPUTYPE_ZEN 51 | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #define CPUTYPE_DHYANA 53 | |||
| #define CPUTYPE_HYGON_UNKNOWN 54 | |||
| #endif | |||
| @@ -34,7 +34,7 @@ | |||
| #define CPU_CORTEXA15 4 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "UNKNOWN", | |||
| "ARMV6", | |||
| "ARMV7", | |||
| "CORTEXA9", | |||
| @@ -29,25 +29,37 @@ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV8 1 | |||
| #define CPU_CORTEXA57 2 | |||
| #define CPU_VULCAN 3 | |||
| #define CPU_THUNDERX 4 | |||
| #define CPU_THUNDERX2T99 5 | |||
| // Arm | |||
| #define CPU_CORTEXA53 2 | |||
| #define CPU_CORTEXA57 3 | |||
| #define CPU_CORTEXA72 4 | |||
| #define CPU_CORTEXA73 5 | |||
| // Qualcomm | |||
| #define CPU_FALKOR 6 | |||
| // Cavium | |||
| #define CPU_THUNDERX 7 | |||
| #define CPU_THUNDERX2T99 8 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "ARMV8" , | |||
| "CORTEXA53", | |||
| "CORTEXA57", | |||
| "VULCAN", | |||
| "CORTEXA72", | |||
| "CORTEXA73", | |||
| "FALKOR", | |||
| "THUNDERX", | |||
| "THUNDERX2T99" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "unknown", | |||
| "armv8" , | |||
| "armv8", | |||
| "cortexa53", | |||
| "cortexa57", | |||
| "vulcan", | |||
| "cortexa72", | |||
| "cortexa73", | |||
| "falkor", | |||
| "thunderx", | |||
| "thunderx2t99" | |||
| }; | |||
| @@ -114,13 +126,24 @@ int detect(void) | |||
| fclose(infile); | |||
| if(cpu_part != NULL && cpu_implementer != NULL) { | |||
| if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) | |||
| return CPU_CORTEXA57; | |||
| else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) | |||
| return CPU_VULCAN; | |||
| else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||
| // Arm | |||
| if (strstr(cpu_implementer, "0x41")) { | |||
| if (strstr(cpu_part, "0xd03")) | |||
| return CPU_CORTEXA53; | |||
| else if (strstr(cpu_part, "0xd07")) | |||
| return CPU_CORTEXA57; | |||
| else if (strstr(cpu_part, "0xd08")) | |||
| return CPU_CORTEXA72; | |||
| else if (strstr(cpu_part, "0xd09")) | |||
| return CPU_CORTEXA73; | |||
| } | |||
| // Qualcomm | |||
| else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
| return CPU_FALKOR; | |||
| // Cavium | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1")) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
| return CPU_THUNDERX2T99; | |||
| } | |||
| @@ -179,64 +202,63 @@ void get_subdirname(void) | |||
| void get_cpuconfig(void) | |||
| { | |||
| // All arches should define ARMv8 | |||
| printf("#define ARMV8\n"); | |||
| printf("#define HAVE_NEON\n"); // This shouldn't be necessary | |||
| printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_CORTEXA53: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| printf("#define ARMV8\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_VULCAN: | |||
| printf("#define VULCAN \n"); | |||
| printf("#define HAVE_VFP \n"); | |||
| printf("#define HAVE_VFPV3 \n"); | |||
| printf("#define HAVE_NEON \n"); | |||
| printf("#define HAVE_VFPV4 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_CORTEXA57: | |||
| printf("#define CORTEXA57\n"); | |||
| printf("#define HAVE_VFP\n"); | |||
| printf("#define HAVE_VFPV3\n"); | |||
| printf("#define HAVE_NEON\n"); | |||
| printf("#define HAVE_VFPV4\n"); | |||
| case CPU_CORTEXA72: | |||
| case CPU_CORTEXA73: | |||
| // Common minimum settings for these Arm cores | |||
| // Can change a lot, but we need to be conservative | |||
| // TODO: detect info from /sys if possible | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 2097152\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define ARMV8\n"); | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| @@ -248,11 +270,7 @@ void get_cpuconfig(void) | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define VULCAN \n"); | |||
| printf("#define HAVE_VFP \n"); | |||
| printf("#define HAVE_VFPV3 \n"); | |||
| printf("#define HAVE_NEON \n"); | |||
| printf("#define HAVE_VFPV4 \n"); | |||
| printf("#define THUNDERX2T99 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_P5600 1 | |||
| #define CPU_1004K 2 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "P5600" | |||
| "UNKNOWN", | |||
| "P5600", | |||
| "1004K" | |||
| }; | |||
| int detect(void){ | |||
| @@ -90,7 +92,7 @@ int detect(void){ | |||
| if (!strncmp("cpu", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| fprintf(stderr, "%s \n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| @@ -99,43 +101,13 @@ int detect(void){ | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("system type", buffer, 11)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (strstr(p, "loongson3a")) | |||
| return CPU_LOONGSON3A; | |||
| }else{ | |||
| if (strstr(p, "5600")) { | |||
| return CPU_P5600; | |||
| } else if (strstr(p, "1004K")) { | |||
| return CPU_1004K; | |||
| } else | |||
| return CPU_UNKNOWN; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("model name", buffer, 10)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -149,7 +121,7 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_P5600){ | |||
| if(detect()==CPU_P5600|| detect()==CPU_1004K){ | |||
| printf("P5600"); | |||
| }else{ | |||
| printf("UNKNOWN"); | |||
| @@ -170,6 +142,14 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| } else if (detect()==CPU_1004K) { | |||
| printf("#define MIPS1004K\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 26144\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 8\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| @@ -178,6 +158,8 @@ void get_cpuconfig(void){ | |||
| void get_libname(void){ | |||
| if(detect()==CPU_P5600) { | |||
| printf("p5600\n"); | |||
| } else if (detect()==CPU_1004K) { | |||
| printf("1004K\n"); | |||
| }else{ | |||
| printf("mips\n"); | |||
| } | |||
| @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_I6500 6 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "UNKNOWN", | |||
| "SICORTEX", | |||
| "LOONGSON3A", | |||
| "LOONGSON3B", | |||
| @@ -56,6 +56,7 @@ | |||
| #define CPUTYPE_CELL 6 | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| #define CPUTYPE_POWER9 9 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -66,7 +67,8 @@ char *cpuname[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| "POWER8", | |||
| "POWER9" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -78,7 +80,8 @@ char *lowercpuname[] = { | |||
| "power6", | |||
| "cell", | |||
| "ppcg4", | |||
| "power8" | |||
| "power8", | |||
| "power9" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -90,7 +93,8 @@ char *corename[] = { | |||
| "POWER6", | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8" | |||
| "POWER8", | |||
| "POWER8" | |||
| }; | |||
| int detect(void){ | |||
| @@ -120,6 +124,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -127,6 +132,33 @@ int detect(void){ | |||
| #endif | |||
| #ifdef _AIX | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = popen("prtconf|grep 'Processor Type'", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("Pro", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| pclose(infile); | |||
| if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; | |||
| if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; | |||
| if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; | |||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| #endif | |||
| @@ -142,6 +174,52 @@ int detect(void){ | |||
| return CPUTYPE_PPC970; | |||
| #endif | |||
| #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) | |||
| int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x4e: // POWER9 | |||
| return CPUTYPE_POWER8; | |||
| break; | |||
| case 0x4d: | |||
| case 0x4b: // POWER8/8E | |||
| return CPUTYPE_POWER8; | |||
| break; | |||
| case 0x4a: | |||
| case 0x3f: // POWER7/7E | |||
| return CPUTYPE_POWER6; | |||
| break; | |||
| case 0x3e: | |||
| return CPUTYPE_POWER6; | |||
| break; | |||
| case 0x3a: | |||
| return CPUTYPE_POWER5; | |||
| break; | |||
| case 0x35: | |||
| case 0x38: // POWER4 /4+ | |||
| return CPUTYPE_POWER4; | |||
| break; | |||
| case 0x40: | |||
| case 0x41: // POWER3 /3+ | |||
| return CPUTYPE_POWER3; | |||
| break; | |||
| case 0x39: | |||
| case 0x3c: | |||
| case 0x44: | |||
| case 0x45: | |||
| return CPUTYPE_PPC970; | |||
| break; | |||
| case 0x70: | |||
| return CPUTYPE_CELL; | |||
| break; | |||
| case 0x8003: | |||
| return CPUTYPE_PPCG4; | |||
| break; | |||
| default: | |||
| return CPUTYPE_UNKNOWN; | |||
| } | |||
| #endif | |||
| } | |||
| void get_architecture(void){ | |||
| @@ -57,3 +57,8 @@ void get_cpuconfig(void){ | |||
| void get_libname(void){ | |||
| printf("v9\n"); | |||
| } | |||
| /* Returns the core name for this architecture: always the fixed string "sparc". */ | |||
| /* The result points to a string literal, so callers must not modify or free it. */ | |||
| char *get_corename(void){ | |||
| return "sparc"; | |||
| } | |||
| @@ -50,6 +50,8 @@ | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | |||
| #define CORE_HASWELL CORE_NEHALEM | |||
| #define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM | |||
| #define CORE_SKYLAKEX CORE_NEHALEM | |||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
| @@ -95,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| ("mov %%ebx, %%edi;" | |||
| "cpuid;" | |||
| "xchgl %%ebx, %%edi;" | |||
| : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
| : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); | |||
| #else | |||
| __asm__ __volatile__ | |||
| ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
| ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); | |||
| #endif | |||
| } | |||
| @@ -209,6 +211,44 @@ int support_avx(){ | |||
| #endif | |||
| } | |||
| /* | |||
|  * Report whether AVX2 may be used. | |||
|  * | |||
|  * Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises AVX2, | |||
|  * 0 otherwise. Always returns 0 when built with NO_AVX2. | |||
|  * | |||
|  * NOTE(review): OS-level enablement of the YMM state is assumed to be | |||
|  * verified by support_avx(), which is defined elsewhere in this file -- confirm. | |||
|  */ | |||
| int support_avx2(){ | |||
| #ifndef NO_AVX2 | |||
|   int eax, ebx, ecx=0, edx; | |||
|  | |||
|   if (!support_avx()) | |||
|     return 0; | |||
|   cpuid(7, &eax, &ebx, &ecx, &edx); | |||
|   /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 feature flag. Bit 7, which */ | |||
|   /* was tested before, is SMEP and says nothing about AVX2. support_avx512() */ | |||
|   /* checks the same bit through the mask 32. */ | |||
|   if((ebx & (1<<5)) != 0) | |||
|     return 1; //CPU supports AVX2 | |||
|   return 0; | |||
| #else | |||
|   return 0; | |||
| #endif | |||
| } | |||
| /* | |||
|  * Report whether AVX512VL may be used. | |||
|  * | |||
|  * Returns 1 only when all of the following hold, 0 otherwise: | |||
|  *   - support_avx() succeeds, | |||
|  *   - CPUID leaf 7 EBX advertises AVX2 (bit 5) and AVX512VL (bit 31), | |||
|  *   - XCR0 has bits 5..7 set, i.e. the OS saves the AVX-512 register state. | |||
|  * Always returns 0 when built with NO_AVX512. | |||
|  */ | |||
| int support_avx512(){ | |||
| #ifndef NO_AVX512 | |||
|   int eax, ebx, ecx, edx; | |||
|  | |||
|   if (!support_avx()) | |||
|     return 0; | |||
|   cpuid(7, &eax, &ebx, &ecx, &edx); | |||
|   /* Bail out here: previously this branch only re-assigned ret=0 and fell */ | |||
|   /* through, so the AVX2 prerequisite was never actually enforced. */ | |||
|   if((ebx & 32) != 32) | |||
|     return 0; //CPU does not even support AVX2 | |||
|   /* Unsigned constant: 1<<31 overflows a signed int. */ | |||
|   if((ebx & (1U<<31)) == 0) | |||
|     return 0; //CPU does not advertise AVX512VL | |||
|   xgetbv(0, &eax, &edx); | |||
|   /* 0xe0 = XCR0 bits 5..7 (opmask, ZMM_Hi256, Hi16_ZMM state components). */ | |||
|   if((eax & 0xe0) == 0xe0) | |||
|     return 1; //OS supports AVX512VL | |||
|   return 0; | |||
| #else | |||
|   return 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| @@ -231,6 +271,7 @@ int get_vendor(void){ | |||
| if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | |||
| if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | |||
| if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; | |||
| if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -292,6 +333,8 @@ int get_cputype(int gettype){ | |||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | |||
| #ifndef NO_AVX | |||
| if (support_avx()) feature |= HAVE_AVX; | |||
| if (support_avx2()) feature |= HAVE_AVX2; | |||
| if (support_avx512()) feature |= HAVE_AVX512VL; | |||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
| #endif | |||
| @@ -1004,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| } | |||
| } | |||
| if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { | |||
| if ((get_vendor() == VENDOR_AMD) || | |||
| (get_vendor() == VENDOR_HYGON) || | |||
| (get_vendor() == VENDOR_CENTAUR)) { | |||
| cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | |||
| LDTB.size = 4096; | |||
| @@ -1226,22 +1271,18 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| case 15: | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| @@ -1250,33 +1291,27 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 5: | |||
| case 6: | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: | |||
| case 15: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: | |||
| //Skylake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| @@ -1290,33 +1325,36 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 6: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: | |||
| // Skylake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: | |||
| // Xeon Phi Knights Landing | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| @@ -1324,16 +1362,27 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| switch (model) { | |||
| case 6: // Cannon Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| case 14: // Kaby Lake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| @@ -1420,6 +1469,8 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 1: | |||
| // AMD Ryzen | |||
| case 8: | |||
| // AMD Ryzen2 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| @@ -1435,6 +1486,26 @@ int get_cpuname(void){ | |||
| return CPUTYPE_AMD_UNKNOWN; | |||
| } | |||
| if (vendor == VENDOR_HYGON){ | |||
| switch (family) { | |||
| case 0xf: | |||
| switch (exfamily) { | |||
| case 9: | |||
| //Hygon Dhyana | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| } | |||
| return CPUTYPE_HYGON_UNKNOWN; | |||
| } | |||
| if (vendor == VENDOR_CYRIX){ | |||
| switch (family) { | |||
| case 0x4: | |||
| @@ -1556,6 +1627,8 @@ static char *cpuname[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1610,10 +1683,12 @@ static char *lowercpuname[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana" | |||
| }; | |||
| static char *corename[] = { | |||
| "UNKOWN", | |||
| "UNKNOWN", | |||
| "80486", | |||
| "P5", | |||
| "P6", | |||
| @@ -1641,6 +1716,8 @@ static char *corename[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1672,6 +1749,8 @@ static char *corename_lower[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana" | |||
| }; | |||
| @@ -1860,6 +1939,19 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| #ifndef NO_AVX512 | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| case 14: | |||
| // Skylake | |||
| if(support_avx()) | |||
| @@ -1958,6 +2050,8 @@ int get_coretype(void){ | |||
| switch (model) { | |||
| case 1: | |||
| // AMD Ryzen | |||
| case 8: | |||
| // Ryzen 2 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_ZEN; | |||
| @@ -1973,6 +2067,23 @@ int get_coretype(void){ | |||
| } | |||
| } | |||
| if (vendor == VENDOR_HYGON){ | |||
| if (family == 0xf){ | |||
| if (exfamily == 9) { | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_ZEN; | |||
| #else | |||
| return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CORE_BARCELONA; | |||
| } else { | |||
| return CORE_BARCELONA; | |||
| } | |||
| } | |||
| } | |||
| if (vendor == VENDOR_CENTAUR) { | |||
| switch (family) { | |||
| case 0x6: | |||
| @@ -2059,6 +2170,8 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | |||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | |||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
| if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
| if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| @@ -2127,6 +2240,8 @@ void get_sse(void){ | |||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | |||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | |||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
| if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
| if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| @@ -29,15 +29,18 @@ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13" | |||
| "Z13", | |||
| "Z14" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13" | |||
| "z13", | |||
| "z14" | |||
| }; | |||
| int detect(void) | |||
| @@ -62,6 +65,10 @@ int detect(void) | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| /* detect z14, but fall back to z13 */ | |||
| if (strstr(p, "3906")) return CPU_Z13; | |||
| if (strstr(p, "3907")) return CPU_Z13; | |||
| return CPU_GENERIC; | |||
| } | |||
| @@ -107,5 +114,9 @@ void get_cpuconfig(void) | |||
| printf("#define Z13\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| case CPU_Z14: | |||
| printf("#define Z14\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -60,6 +60,14 @@ OS_FREEBSD | |||
| OS_NETBSD | |||
| #endif | |||
| #if defined(__OpenBSD__) | |||
| OS_OPENBSD | |||
| #endif | |||
| #if defined(__DragonFly__) | |||
| OS_DRAGONFLY | |||
| #endif | |||
| #if defined(__sun) | |||
| OS_SUNOS | |||
| #endif | |||
| @@ -93,6 +101,10 @@ OS_INTERIX | |||
| OS_LINUX | |||
| #endif | |||
| #if defined(__HAIKU__) | |||
| OS_HAIKU | |||
| #endif | |||
| #if defined(__i386) || defined(_X86) | |||
| ARCH_X86 | |||
| #endif | |||
| @@ -102,7 +102,13 @@ clean :: | |||
| rm -f x* | |||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| CEXTRALIB = | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| # Single real | |||
| xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | |||
| @@ -62,9 +62,36 @@ | |||
| #endif | |||
| #endif | |||
| #ifndef TRANSA | |||
| #ifndef thread_local | |||
| # if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ | |||
| # define thread_local _Thread_local | |||
| # elif defined _WIN32 && ( \ | |||
| defined _MSC_VER || \ | |||
| defined __ICL || \ | |||
| defined __DMC__ || \ | |||
| defined __BORLANDC__ ) | |||
| # define thread_local __declspec(thread) | |||
| /* note that ICC (linux) and Clang are covered by __GNUC__ */ | |||
| # elif defined __GNUC__ || \ | |||
| defined __SUNPRO_C || \ | |||
| defined __xlC__ | |||
| # define thread_local __thread | |||
| # else | |||
| # define UNSAFE | |||
| #endif | |||
| #endif | |||
| #if defined USE_OPENMP | |||
| #undef UNSAFE | |||
| #endif | |||
| #if !defined(TRANSA) && !defined(UNSAFE) | |||
| #define Y_DUMMY_NUM 1024 | |||
| #if defined(USE_OPENMP) | |||
| static FLOAT y_dummy[Y_DUMMY_NUM]; | |||
| #pragma omp threadprivate(y_dummy) | |||
| # else | |||
| static thread_local FLOAT y_dummy[Y_DUMMY_NUM]; | |||
| # endif | |||
| #endif | |||
| static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | |||
| @@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| #ifdef TRANSA | |||
| y += n_from * incy * COMPSIZE; | |||
| #else | |||
| # ifndef UNSAFE | |||
| //for split matrix row (n) direction and vector x of gemv_n | |||
| x += n_from * incx * COMPSIZE; | |||
| //store partial result for every thread | |||
| y += (m_to - m_from) * 1 * COMPSIZE * pos; | |||
| # endif | |||
| #endif | |||
| } | |||
| @@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| BLASLONG width, i, num_cpu; | |||
| #ifndef TRANSA | |||
| #if !defined(TRANSA) && !defined(UNSAFE) | |||
| int split_x=0; | |||
| #endif | |||
| @@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| i -= width; | |||
| } | |||
| #ifndef TRANSA | |||
| #if !defined(TRANSA) && !defined(UNSAFE) | |||
| //try to split matrix on row direction and x. | |||
| //Then, reduction. | |||
| if (num_cpu < nthreads) { | |||
| @@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| exec_blas(num_cpu, queue); | |||
| } | |||
| #ifndef TRANSA | |||
| #if !defined(TRANSA) && !defined(UNSAFE) | |||
| if(split_x==1){ | |||
| //reduction | |||
| for(i=0; i<num_cpu; i++){ | |||
| @@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| /*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */ | |||
| /* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now */ | |||
| for (is = 0; is < m; is += DTB_ENTRIES){ | |||
| for (is = 0; is < m; is += DTB_ENTRIES * 100){ | |||
| min_i = MIN(m - is, DTB_ENTRIES * 100); | |||
| min_i = MIN(m - is, DTB_ENTRIES); | |||
| #ifndef TRANSA | |||
| if (is > 0){ | |||
| fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n"); | |||
| if (is > 0){ | |||
| GEMV_N(is, min_i, 0, dp1, | |||
| a + is * lda, lda, | |||
| B + is, 1, | |||
| @@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) | |||
| xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) | |||
| xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) | |||
| @@ -91,7 +91,12 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -67,7 +67,12 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -48,6 +48,10 @@ | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| #ifndef GEMM_PREFERED_SIZE | |||
| #define GEMM_PREFERED_SIZE 1 | |||
| #endif | |||
| //The array of job_t may overflow the stack. | |||
| //Instead, use malloc to alloc job_t. | |||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||
| @@ -91,7 +95,8 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| volatile | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -346,7 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Make sure that no one is using the workspace */ | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| #if defined(FUSED_GEMM) && !defined(TIMING) | |||
| @@ -408,7 +413,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Wait until other region of B is initialized */ | |||
| START_RPCC(); | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| /* Apply kernel with local region of A and part of other region of B */ | |||
| @@ -426,6 +431,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Clear synchronization flag if this thread is done with other region of B */ | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| @@ -487,7 +493,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| for (js = 0; js < DIVIDE_RATE; js++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; | |||
| } | |||
| } | |||
| STOP_RPCC(waiting3); | |||
| @@ -508,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| return 0; | |||
| } | |||
| /* Round `width` up to the next multiple of `multiple`. `width` is returned unchanged when `multiple` is larger than `remainder` or when `width` is not larger than `multiple`. */ static int round_up(int remainder, int width, int multiple) | |||
| { | |||
| if (multiple > remainder || width <= multiple) | |||
| return width; | |||
| width = (width + multiple - 1) / multiple; | |||
| width = width * multiple; | |||
| return width; | |||
| } | |||
| static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| *range_n, FLOAT *sa, FLOAT *sb, | |||
| BLASLONG nthreads_m, BLASLONG nthreads_n) { | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #else | |||
| CRITICAL_SECTION level3_lock; | |||
| InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| blas_arg_t newarg; | |||
| #ifndef USE_ALLOC_HEAP | |||
| @@ -552,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| #endif | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_lock(&level3_lock); | |||
| #else | |||
| EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| #ifdef USE_ALLOC_HEAP | |||
| /* Dynamically allocate workspace */ | |||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||
| @@ -599,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| num_parts = 0; | |||
| while (m > 0){ | |||
| width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); | |||
| width = round_up(m, width, GEMM_PREFERED_SIZE); | |||
| m -= width; | |||
| if (m < 0) width = width + m; | |||
| range_M[num_parts + 1] = range_M[num_parts] + width; | |||
| num_parts ++; | |||
| } | |||
| for (i = num_parts; i < MAX_CPU_NUMBER; i++) { | |||
| @@ -643,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| if (width < SWITCH_RATIO) { | |||
| width = SWITCH_RATIO; | |||
| } | |||
| width = round_up(n, width, GEMM_PREFERED_SIZE); | |||
| n -= width; | |||
| if (n < 0) width = width + n; | |||
| range_N[num_parts + 1] = range_N[num_parts] + width; | |||
| num_parts ++; | |||
| } | |||
| for (j = num_parts; j < MAX_CPU_NUMBER; j++) { | |||
| @@ -653,8 +694,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| } | |||
| /* Clear synchronization flags */ | |||
| for (i = 0; i < MAX_CPU_NUMBER; i++) { | |||
| for (j = 0; j < MAX_CPU_NUMBER; j++) { | |||
| for (i = 0; i < nthreads; i++) { | |||
| for (j = 0; j < nthreads; j++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| job[i].working[j][CACHE_LINE_SIZE * k] = 0; | |||
| } | |||
| @@ -669,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| free(job); | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_unlock(&level3_lock); | |||
| #else | |||
| LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| BLASLONG width, i; | |||
| BLASLONG n_from, n_to; | |||
| double dnum, nf, nt, di; | |||
| double dnum, nf, nt, di, dinum; | |||
| int num_cpu; | |||
| int mask = 0; | |||
| @@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)i; | |||
| width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); | |||
| dinum = di * di +dnum; | |||
| if (dinum <0) | |||
| width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1); | |||
| else | |||
| width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| @@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| nf = (double)(arg -> n - n_from); | |||
| nt = (double)(arg -> n - n_to); | |||
| dnum = (nt * nt - nf * nf) / (double)nthreads; | |||
| num_cpu = 0; | |||
| range[0] = n_from; | |||
| @@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)(arg -> n - i); | |||
| width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); | |||
| dinum = di * di + dnum; | |||
| if (dinum<0) | |||
| width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1); | |||
| else | |||
| width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| } else { | |||
| @@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) | |||
| GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) | |||
| if (DYNAMIC_ARCH) | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| if (ARM64) | |||
| list(APPEND COMMON_SOURCES dynamic_arm64.c) | |||
| else () | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| endif () | |||
| else () | |||
| list(APPEND COMMON_SOURCES parameter.c) | |||
| endif () | |||
| @@ -15,7 +15,11 @@ endif | |||
| # COMMONOBJS += info.$(SUFFIX) | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH),arm64) | |||
| COMMONOBJS += dynamic_arm64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c | |||
| endif | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH),arm64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) | |||
| #include <dlfcn.h> | |||
| #include <signal.h> | |||
| #include <sys/resource.h> | |||
| @@ -582,7 +582,7 @@ int blas_thread_init(void){ | |||
| if(ret!=0){ | |||
| struct rlimit rlim; | |||
| const char *msg = strerror(ret); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); | |||
| #ifdef RLIMIT_NPROC | |||
| if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " | |||
| @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { | |||
| long i; | |||
| #ifdef SMP_SERVER | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| #ifndef NO_AFFINITY | |||
| @@ -36,6 +36,7 @@ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdbool.h> | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| //#include <sys/mman.h> | |||
| @@ -47,13 +48,22 @@ | |||
| #else | |||
| #ifndef OMP_SCHED | |||
| #define OMP_SCHED static | |||
| #endif | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| #else | |||
| static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| #endif | |||
| void goto_set_num_threads(int num_threads) { | |||
| int i=0; | |||
| int i=0, j=0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| @@ -68,15 +78,17 @@ void goto_set_num_threads(int num_threads) { | |||
| omp_set_num_threads(blas_cpu_number); | |||
| //adjust buffer for each thread | |||
| for(i=0; i<blas_cpu_number; i++){ | |||
| if(blas_thread_buffer[i]==NULL){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<blas_cpu_number; j++){ | |||
| if(blas_thread_buffer[i][j]==NULL){ | |||
| blas_thread_buffer[i][j]=blas_memory_alloc(2); | |||
| } | |||
| } | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| for(; j<MAX_CPU_NUMBER; j++){ | |||
| if(blas_thread_buffer[i][j]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i][j]); | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| } | |||
| #if defined(ARCH_MIPS64) | |||
| @@ -92,30 +104,34 @@ void openblas_set_num_threads(int num_threads) { | |||
| int blas_thread_init(void){ | |||
| int i=0; | |||
| int i=0, j=0; | |||
| blas_get_cpu_number(); | |||
| blas_server_avail = 1; | |||
| for(i=0; i<blas_num_threads; i++){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| blas_thread_buffer[i]=NULL; | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<blas_num_threads; j++){ | |||
| blas_thread_buffer[i][j]=blas_memory_alloc(2); | |||
| } | |||
| for(; j<MAX_CPU_NUMBER; j++){ | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i=0; | |||
| int i=0, j=0; | |||
| blas_server_avail = 0; | |||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<MAX_CPU_NUMBER; j++){ | |||
| if(blas_thread_buffer[i][j]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i][j]); | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| } | |||
| @@ -206,7 +222,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| static void exec_threads(blas_queue_t *queue){ | |||
| static void exec_threads(blas_queue_t *queue, int buf_index){ | |||
| void *buffer, *sa, *sb; | |||
| int pos=0, release_flag=0; | |||
| @@ -223,7 +239,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | |||
| pos = omp_get_thread_num(); | |||
| buffer = blas_thread_buffer[pos]; | |||
| buffer = blas_thread_buffer[buf_index][pos]; | |||
| //fallback | |||
| if(buffer==NULL) { | |||
| @@ -291,7 +307,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| BLASLONG i; | |||
| BLASLONG i, buf_index; | |||
| if ((num <= 0) || (queue == NULL)) return 0; | |||
| @@ -302,16 +318,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| } | |||
| #endif | |||
| #pragma omp parallel for schedule(static) | |||
| while(true) { | |||
| for(i=0; i < MAX_PARALLEL_NUMBER; i++) { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Bool inuse = false; | |||
| if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { | |||
| #else | |||
| if(blas_buffer_inuse[i] == false) { | |||
| blas_buffer_inuse[i] = true; | |||
| #endif | |||
| buf_index = i; | |||
| break; | |||
| } | |||
| } | |||
| if(i != MAX_PARALLEL_NUMBER) | |||
| break; | |||
| } | |||
| #pragma omp parallel for schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| queue[i].position = i; | |||
| #endif | |||
| exec_threads(&queue[i]); | |||
| exec_threads(&queue[i], buf_index); | |||
| } | |||
| #if __STDC_VERSION__ >= 201112L | |||
| atomic_store(&blas_buffer_inuse[buf_index], false); | |||
| #else | |||
| blas_buffer_inuse[buf_index] = false; | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -40,6 +40,14 @@ | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #if defined(OS_CYGWIN_NT) && !defined(unlikely) | |||
| #ifdef __GNUC__ | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #else | |||
| #define unlikely(x) (x) | |||
| #endif | |||
| #endif | |||
| /* This is a thread implementation for Win32 lazy implementation */ | |||
| /* Thread server common information */ | |||
| @@ -53,7 +61,7 @@ typedef struct{ | |||
| } blas_pool_t; | |||
| /* We need this grobal for cheking if initialization is finished. */ | |||
| /* We need this global for checking if initialization is finished. */ | |||
| int blas_server_avail = 0; | |||
| /* Local Variables */ | |||
| @@ -340,6 +348,11 @@ int blas_thread_init(void){ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| blas_queue_t *current; | |||
| current = queue; | |||
| @@ -405,6 +418,11 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| /* Execute Threads */ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| #ifndef ALL_THREADED | |||
| int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); | |||
| #endif | |||
| @@ -460,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| void goto_set_num_threads(int num_threads) | |||
| { | |||
| long i; | |||
| long i; | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| if (num_threads < 1) num_threads = blas_cpu_number; | |||
| @@ -49,6 +49,167 @@ | |||
| #define EXTERN | |||
| #endif | |||
| #ifdef DYNAMIC_LIST | |||
| extern gotoblas_t gotoblas_PRESCOTT; | |||
| #ifdef DYN_ATHLON | |||
| extern gotoblas_t gotoblas_ATHLON; | |||
| #else | |||
| #define gotoblas_ATHLON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_KATMAI | |||
| extern gotoblas_t gotoblas_KATMAI; | |||
| #else | |||
| #define gotoblas_KATMAI gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BANIAS | |||
| extern gotoblas_t gotoblas_BANIAS; | |||
| #else | |||
| #define gotoblas_BANIAS gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_COPPERMINE | |||
| extern gotoblas_t gotoblas_COPPERMINE; | |||
| #else | |||
| #define gotoblas_COPPERMINE gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NORTHWOOD | |||
| extern gotoblas_t gotoblas_NORTHWOOD; | |||
| #else | |||
| #define gotoblas_NORTHWOOD gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_CORE2 | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| #else | |||
| #define gotoblas_CORE2 gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NEHALEM | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| #else | |||
| #define gotoblas_NEHALEM gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BARCELONA | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BARCELONA gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BARCELONA gotoblas_PRESCOTT | |||
| #endif | |||
| /* ATOM kernel table: use the dedicated build when DYN_ATOM is set, */ | |||
| /* otherwise alias it to NEHALEM if that is built, else to PRESCOTT */ | |||
| /* (which is declared unconditionally at the top of this list). */ | |||
| #ifdef DYN_ATOM | |||
| extern gotoblas_t gotoblas_ATOM; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_ATOM gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_ATOM gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_NANO | |||
| extern gotoblas_t gotoblas_NANO; | |||
| #else | |||
| #define gotoblas_NANO gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_PENRYN | |||
| extern gotoblas_t gotoblas_PENRYN; | |||
| #else | |||
| #define gotoblas_PENRYN gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_DUNNINGTON | |||
| extern gotoblas_t gotoblas_DUNNINGTON; | |||
| #else | |||
| #define gotoblas_DUNNINGTON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_OPTERON | |||
| extern gotoblas_t gotoblas_OPTERON; | |||
| #else | |||
| #define gotoblas_OPTERON gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_OPTERON_SSE3 | |||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | |||
| #else | |||
| #define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BOBCAT | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BOBCAT gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BOBCAT gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_SANDYBRIDGE | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_BULLDOZER | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_BULLDOZER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_BULLDOZER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_PILEDRIVER | |||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_PILEDRIVER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_PILEDRIVER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_STEAMROLLER | |||
| extern gotoblas_t gotoblas_STEAMROLLER; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_STEAMROLLER gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_STEAMROLLER gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_EXCAVATOR | |||
| extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_EXCAVATOR gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_EXCAVATOR gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_HASWELL | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_HASWELL gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_ZEN | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| #elif defined(DYN_HASWELL) | |||
| #define gotoblas_ZEN gotoblas_HASWELL | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_ZEN gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_ZEN gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_SKYLAKEX | |||
| extern gotoblas_t gotoblas_SKYLAKEX; | |||
| #elif defined(DYN_HASWELL) | |||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_SKYLAKEX gotoblas_PRESCOTT | |||
| #endif | |||
| #else // not DYNAMIC_LIST | |||
| EXTERN gotoblas_t gotoblas_KATMAI; | |||
| EXTERN gotoblas_t gotoblas_COPPERMINE; | |||
| EXTERN gotoblas_t gotoblas_NORTHWOOD; | |||
| @@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS; | |||
| EXTERN gotoblas_t gotoblas_ATHLON; | |||
| extern gotoblas_t gotoblas_PRESCOTT; | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| #ifdef DYNAMIC_OLDER | |||
| extern gotoblas_t gotoblas_ATOM; | |||
| extern gotoblas_t gotoblas_NANO; | |||
| extern gotoblas_t gotoblas_CORE2; | |||
| extern gotoblas_t gotoblas_PENRYN; | |||
| extern gotoblas_t gotoblas_DUNNINGTON; | |||
| extern gotoblas_t gotoblas_NEHALEM; | |||
| extern gotoblas_t gotoblas_OPTERON; | |||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #else | |||
| #define gotoblas_ATOM gotoblas_NEHALEM | |||
| #define gotoblas_NANO gotoblas_NEHALEM | |||
| #define gotoblas_PENRYN gotoblas_CORE2 | |||
| #define gotoblas_DUNNINGTON gotoblas_CORE2 | |||
| #define gotoblas_OPTERON gotoblas_CORE2 | |||
| #define gotoblas_OPTERON_SSE3 gotoblas_CORE2 | |||
| #define gotoblas_BOBCAT gotoblas_CORE2 | |||
| #endif | |||
| #ifndef NO_AVX | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| @@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER; | |||
| extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #ifdef NO_AVX2 | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #else | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| #ifndef NO_AVX512 | |||
| extern gotoblas_t gotoblas_SKYLAKEX; | |||
| #else | |||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | |||
| #endif | |||
| #endif | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | |||
| @@ -90,10 +269,12 @@ extern gotoblas_t gotoblas_ZEN; | |||
| #define gotoblas_ZEN gotoblas_BARCELONA | |||
| #endif | |||
| #endif // DYNAMIC_LIST | |||
| #define VENDOR_INTEL 1 | |||
| #define VENDOR_AMD 2 | |||
| #define VENDOR_CENTAUR 3 | |||
| #define VENDOR_HYGON 4 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -124,9 +305,49 @@ int support_avx(){ | |||
| #endif | |||
| } | |||
| /* Report whether AVX2 kernels may be selected. */ | |||
| /* Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises */ | |||
| /* AVX2, 0 otherwise. Always returns 0 when built with NO_AVX2. */ | |||
| int support_avx2(){ | |||
| #ifndef NO_AVX2 | |||
| int eax, ebx, ecx=0, edx; | |||
| if (!support_avx()) | |||
| return 0; | |||
| cpuid(7, &eax, &ebx, &ecx, &edx); | |||
| /* CPUID.(EAX=7,ECX=0):EBX bit 5 is the AVX2 flag; bit 7 is SMEP */ | |||
| return (ebx & (1<<5)) != 0; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| /* Report whether AVX512 kernels may be selected. */ | |||
| /* Requires support_avx(), the AVX2 and AVX512VL flags in CPUID leaf 7, */ | |||
| /* and bits 5-7 (mask 0xe0) set in the value read by xgetbv(0). */ | |||
| /* Returns 1 if usable, 0 otherwise. Always 0 when built with NO_AVX512. */ | |||
| int support_avx512(){ | |||
| #ifndef NO_AVX512 | |||
| int eax, ebx, ecx=0, edx; | |||
| int ret=0; | |||
| if (!support_avx()) | |||
| return 0; | |||
| cpuid(7, &eax, &ebx, &ecx, &edx); | |||
| if((ebx & (1<<5)) == 0) | |||
| return 0; /* CPU does not even report AVX2 (EBX bit 5) */ | |||
| /* test bit 31 with an unsigned constant: 1<<31 overflows a signed int */ | |||
| if(((unsigned int)ebx & (1U<<31)) != 0){ | |||
| xgetbv(0, &eax, &edx); | |||
| if((eax & 0xe0) == 0xe0) | |||
| ret=1; /* OS supports AVX512VL */ | |||
| } | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | |||
| #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" | |||
| #define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" | |||
| #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" | |||
| static int get_vendor(void){ | |||
| @@ -149,6 +370,7 @@ static int get_vendor(void){ | |||
| if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | |||
| if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | |||
| if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -223,18 +445,24 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| //Intel Haswell | |||
| if (model == 12 || model == 15) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Broadwell | |||
| if (model == 13) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -244,27 +472,36 @@ static gotoblas_t *get_coretype(void){ | |||
| case 4: | |||
| //Intel Haswell | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Broadwell | |||
| if (model == 7 || model == 15) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Skylake | |||
| if (model == 14) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -277,27 +514,54 @@ static gotoblas_t *get_coretype(void){ | |||
| case 5: | |||
| //Intel Broadwell | |||
| if (model == 6) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| if (model == 5) { | |||
| // Intel Skylake X | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| //Intel Skylake | |||
| if (model == 14 || model == 5) { | |||
| if(support_avx()) | |||
| if (model == 14) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Phi Knights Landing | |||
| if (model == 7) { | |||
| if(support_avx()) | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -307,12 +571,29 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| case 6: | |||
| if (model == 6) { | |||
| // Cannon Lake | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 14 ) { // Kaby Lake | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -325,7 +606,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| if (vendor == VENDOR_AMD){ | |||
| if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ | |||
| if (family <= 0xe) { | |||
| // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon | |||
| cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |||
| @@ -397,7 +678,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| } else if (exfamily == 8) { | |||
| if (model == 1) { | |||
| if (model == 1 || model == 8) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| @@ -405,6 +686,13 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 9) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| @@ -445,7 +733,8 @@ static char *corename[] = { | |||
| "Haswell", | |||
| "Steamroller", | |||
| "Excavator", | |||
| "Zen" | |||
| "Zen", | |||
| "SkylakeX" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -473,7 +762,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; | |||
| if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; | |||
| if (gotoblas == &gotoblas_ZEN) return corename[23]; | |||
| if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; | |||
| return corename[0]; | |||
| } | |||
| @@ -485,7 +774,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| char message[128]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 23; i++) | |||
| for ( i=1 ; i <= 24; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -503,6 +792,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| switch (found) | |||
| { | |||
| case 24: return (&gotoblas_SKYLAKEX); | |||
| case 23: return (&gotoblas_ZEN); | |||
| case 22: return (&gotoblas_EXCAVATOR); | |||
| case 21: return (&gotoblas_STEAMROLLER); | |||
| @@ -0,0 +1,198 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #include <asm/hwcap.h> | |||
| #include <sys/auxv.h> | |||
| extern gotoblas_t gotoblas_ARMV8; | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| extern gotoblas_t gotoblas_THUNDERX; | |||
| extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 4 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| * that HWCAP_CPUID is defined | |||
| */ | |||
| #ifndef HWCAP_CPUID | |||
| #define HWCAP_CPUID (1 << 11) | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| asm("mrs %0, "#id : "=r" (var)); \ | |||
| }) | |||
| static char *corename[] = { | |||
| "armv8", | |||
| "cortexa57", | |||
| "thunderx", | |||
| "thunderx2t99", | |||
| "unknown" | |||
| }; | |||
| /* Return the printable name of the kernel table currently selected in */ | |||
| /* gotoblas; the last corename[] entry is returned when none matches. */ | |||
| char *gotoblas_corename(void) { | |||
| int idx = NUM_CORETYPES; | |||
| if (gotoblas == &gotoblas_ARMV8) | |||
| idx = 0; | |||
| else if (gotoblas == &gotoblas_CORTEXA57) | |||
| idx = 1; | |||
| else if (gotoblas == &gotoblas_THUNDERX) | |||
| idx = 2; | |||
| else if (gotoblas == &gotoblas_THUNDERX2T99) | |||
| idx = 3; | |||
| return corename[idx]; | |||
| } | |||
| /* Map a user-supplied core name (case-insensitive, compared over at most */ | |||
| /* 20 characters) to its kernel table. Emits a warning and returns NULL */ | |||
| /* when the name matches none of the first NUM_CORETYPES corename[] entries. */ | |||
| static gotoblas_t *force_coretype(char *coretype) { | |||
| /* kernel tables in the same order as the corename[] entries */ | |||
| gotoblas_t *tables[NUM_CORETYPES] = { | |||
| &gotoblas_ARMV8, | |||
| &gotoblas_CORTEXA57, | |||
| &gotoblas_THUNDERX, | |||
| &gotoblas_THUNDERX2T99 | |||
| }; | |||
| char message[128]; | |||
| int idx; | |||
| for (idx = 0; idx < NUM_CORETYPES; idx++) { | |||
| if (strncasecmp(coretype, corename[idx], 20) == 0) | |||
| return tables[idx]; | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| /* Detect the running core from MIDR_EL1 and return the matching kernel */ | |||
| /* table. Returns NULL when the kernel does not advertise HWCAP_CPUID */ | |||
| /* (a warning is emitted) or when the implementer/part pair is unknown. */ | |||
| static gotoblas_t *get_coretype(void) { | |||
| /* mrs writes a full 64-bit register, so read into a 64-bit variable */ | |||
| unsigned long long midr_el1; | |||
| int implementer, part; | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { | |||
| openblas_warning(1, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n"); | |||
| return NULL; | |||
| } | |||
| get_cpu_ftr(MIDR_EL1, midr_el1); | |||
| /* | |||
| * MIDR_EL1 | |||
| * | |||
| * 31 24 23 20 19 16 15 4 3 0 | |||
| * ----------------------------------------------------------------- | |||
| * | Implementer | Variant | Architecture | Part Number | Revision | | |||
| * ----------------------------------------------------------------- | |||
| */ | |||
| implementer = (int)((midr_el1 >> 24) & 0xFF); | |||
| part = (int)((midr_el1 >> 4) & 0xFFF); | |||
| switch(implementer) | |||
| { | |||
| case 0x41: /* ARM */ | |||
| switch (part) | |||
| { | |||
| case 0xd07: /* Cortex A57 */ | |||
| case 0xd08: /* Cortex A72 */ | |||
| case 0xd03: /* Cortex A53 */ | |||
| return &gotoblas_CORTEXA57; | |||
| } | |||
| break; | |||
| case 0x42: /* Broadcom */ | |||
| switch (part) | |||
| { | |||
| case 0x516: /* Vulcan */ | |||
| return &gotoblas_THUNDERX2T99; | |||
| } | |||
| break; | |||
| case 0x43: /* Cavium */ | |||
| switch (part) | |||
| { | |||
| case 0x0a1: /* ThunderX */ | |||
| return &gotoblas_THUNDERX; | |||
| case 0x0af: /* ThunderX2 */ | |||
| return &gotoblas_THUNDERX2T99; | |||
| } | |||
| break; | |||
| } | |||
| return NULL; | |||
| } | |||
| /* Select and initialise the kernel table used at run time. */ | |||
| /* Does nothing if gotoblas is already set. The OPENBLAS_CORETYPE */ | |||
| /* environment variable, when present, overrides auto detection; if */ | |||
| /* neither yields a table the generic ARMV8 one is used. Exits the */ | |||
| /* process when the chosen table has no init function. */ | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char *p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if ( p ) | |||
| { | |||
| gotoblas = force_coretype(p); | |||
| } | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| } | |||
| if (gotoblas == NULL) | |||
| { | |||
| snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_ARMV8; | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| /* bounded, always NUL-terminated; %.20s keeps the 20-character name cap */ | |||
| snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename()); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| } | |||
| /* Drop the selected kernel table. With gotoblas back to NULL, the next */ | |||
| /* gotoblas_dynamic_init() call no longer returns early and selects again. */ | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -35,9 +35,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <string.h> | |||
| #if defined(_WIN32) && defined(_MSC_VER) | |||
| #if _MSC_VER < 1900 | |||
| #define snprintf _snprintf | |||
| #endif | |||
| #endif | |||
| static char* openblas_config_str="" | |||
| "OpenBLAS " | |||
| VERSION | |||
| " " | |||
| #ifdef USE64BITINT | |||
| "USE64BITINT " | |||
| " USE64BITINT " | |||
| #endif | |||
| #ifdef NO_CBLAS | |||
| "NO_CBLAS " | |||
| @@ -54,6 +63,9 @@ static char* openblas_config_str="" | |||
| #ifdef NO_AFFINITY | |||
| "NO_AFFINITY " | |||
| #endif | |||
| #ifdef USE_OPENMP | |||
| "USE_OPENMP " | |||
| #endif | |||
| #ifndef DYNAMIC_ARCH | |||
| CHAR_CORENAME | |||
| #endif | |||
| @@ -61,18 +73,23 @@ static char* openblas_config_str="" | |||
| #ifdef DYNAMIC_ARCH | |||
| char *gotoblas_corename(); | |||
| static char tmp_config_str[256]; | |||
| #endif | |||
| static char tmp_config_str[256]; | |||
| int openblas_get_parallel(); | |||
| char* CNAME() { | |||
| #ifndef DYNAMIC_ARCH | |||
| return openblas_config_str; | |||
| #else | |||
| char tmpstr[20]; | |||
| strcpy(tmp_config_str, openblas_config_str); | |||
| #ifdef DYNAMIC_ARCH | |||
| strcat(tmp_config_str, gotoblas_corename()); | |||
| return tmp_config_str; | |||
| #endif | |||
| if (openblas_get_parallel() == 0) | |||
| sprintf(tmpstr, " SINGLE_THREADED"); | |||
| else | |||
| snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); | |||
| strcat(tmp_config_str, tmpstr); | |||
| return tmp_config_str; | |||
| } | |||
| @@ -83,3 +100,4 @@ char* openblas_get_corename() { | |||
| return gotoblas_corename(); | |||
| #endif | |||
| } | |||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -730,35 +730,8 @@ void blas_set_parameter(void){ | |||
| #if defined(ARCH_ARM64) | |||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||
| unsigned long dgemm_prefetch_size_a; | |||
| unsigned long dgemm_prefetch_size_b; | |||
| unsigned long dgemm_prefetch_size_c; | |||
| #endif | |||
| void blas_set_parameter(void) | |||
| { | |||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||
| dgemm_p = 160; | |||
| dgemm_q = 128; | |||
| dgemm_r = 4096; | |||
| sgemm_p = 128; | |||
| sgemm_q = 352; | |||
| sgemm_r = 4096; | |||
| cgemm_p = 128; | |||
| cgemm_q = 224; | |||
| cgemm_r = 4096; | |||
| zgemm_p = 128; | |||
| zgemm_q = 112; | |||
| zgemm_r = 4096; | |||
| dgemm_prefetch_size_a = 3584; | |||
| dgemm_prefetch_size_b = 512; | |||
| dgemm_prefetch_size_c = 128; | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -114,20 +114,22 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||
| endif | |||
| ifneq (,$(filter 1 2,$(NOFORTRAN))) | |||
| #only build without Fortran | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| so : ../$(LIBSONAME) | |||
| ifeq ($(OSNAME), Android) | |||
| INTERNALNAME = $(LIBPREFIX).so | |||
| FEXTRALIB += -lm | |||
| EXTRALIB += -lm | |||
| else | |||
| INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| @@ -156,7 +158,7 @@ endif | |||
| endif | |||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| so : ../$(LIBSONAME) | |||
| @@ -97,7 +97,7 @@ if ($compiler eq "") { | |||
| if ($data =~ /Intel/) { | |||
| $vendor = INTEL; | |||
| $openmp = "-openmp"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($data =~ /Sun Fortran/) { | |||
| @@ -127,7 +127,7 @@ if ($compiler eq "") { | |||
| # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | |||
| if ($data =~ /zho_ge__/) { | |||
| if ($data =~ / zho_ge__/) { | |||
| $need2bu = 1; | |||
| } | |||
| } | |||
| @@ -155,7 +155,7 @@ if ($compiler eq "") { | |||
| if ($compiler =~ /ifort/) { | |||
| $vendor = INTEL; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /pathf/) { | |||
| @@ -292,9 +292,6 @@ if ($link ne "") { | |||
| && ($flags !~ /^-LIST:/) | |||
| && ($flags !~ /^-LANG:/) | |||
| ) { | |||
| if ($vendor eq "PGI") { | |||
| $flags =~ s/lib$/libso/; | |||
| } | |||
| $linker_L .= $flags . " "; | |||
| } | |||
| @@ -311,17 +308,11 @@ if ($link ne "") { | |||
| if ($flags =~ /^\-rpath\@/) { | |||
| $flags =~ s/\@/\,/g; | |||
| if ($vendor eq "PGI") { | |||
| $flags =~ s/lib$/libso/; | |||
| } | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /^\-rpath-link\@/) { | |||
| $flags =~ s/\@/\,/g; | |||
| if ($vendor eq "PGI") { | |||
| $flags =~ s/lib$/libso/; | |||
| } | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| @@ -330,7 +321,6 @@ if ($link ne "") { | |||
| && ($flags !~ /gfortranbegin/) | |||
| && ($flags !~ /frtbegin/) | |||
| && ($flags !~ /pathfstart/) | |||
| && ($flags !~ /numa/) | |||
| && ($flags !~ /crt[0-9]/) | |||
| && ($flags !~ /gcc/) | |||
| && ($flags !~ /user32/) | |||
| @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef OS_WINDOWS | |||
| #include <windows.h> | |||
| #endif | |||
| #if defined(__FreeBSD__) || defined(__APPLE__) | |||
| #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| #include <sys/types.h> | |||
| #include <sys/sysctl.h> | |||
| #endif | |||
| @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| /* Forced-target description for SKYLAKEX. Defining FORCE makes main() report CORENAME | |||
|  * instead of a detected core name. ARCHCONFIG is one concatenated string carrying the | |||
|  * -D cache/TLB parameters and ISA feature macros; note that it also carries a compiler | |||
|  * switch (-march=skylake-avx512), not only preprocessor definitions. */ | |||
| #ifdef FORCE_SKYLAKEX | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "SKYLAKEX" | |||
| #define ARCHCONFIG "-DSKYLAKEX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" | |||
| #define LIBNAME "skylakex" | |||
| #define CORENAME "SKYLAKEX" | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -912,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DARMV8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "armv8" | |||
| #define CORENAME "ARMV8" | |||
| #endif | |||
| /* Forced-target description for CORTEXA53 (ARM64, kernels taken from the "arm64" | |||
|  * subdirectory). ARCHCONFIG is a single concatenated string of -D definitions giving | |||
|  * L1 code/data and L2 cache geometry, TLB parameters and the VFP/NEON feature macros; | |||
|  * it also defines ARMV8. The trailing #else branch is intentionally empty. */ | |||
| #ifdef FORCE_CORTEXA53 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "CORTEXA53" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA53 " \ | |||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa53" | |||
| #define CORENAME "CORTEXA53" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA57 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| @@ -927,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa57" | |||
| #define CORENAME "CORTEXA57" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_VULCAN | |||
| #ifdef FORCE_CORTEXA72 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "VULCAN" | |||
| #define SUBARCHITECTURE "CORTEXA72" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DVULCAN " \ | |||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| #define ARCHCONFIG "-DCORTEXA72 " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "vulcan" | |||
| #define CORENAME "VULCAN" | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa72" | |||
| #define CORENAME "CORTEXA72" | |||
| #else | |||
| #endif | |||
| /* Forced-target description for CORTEXA73 (ARM64, "arm64" kernel subdirectory). | |||
|  * ARCHCONFIG is a single concatenated string of -D definitions: cache geometry, TLB | |||
|  * parameters, VFP/NEON feature macros and ARMV8. | |||
|  * NOTE(review): the cache/TLB values are identical to the ones used for CORTEXA72 in | |||
|  * this file -- confirm they are intended for this core rather than copied placeholders. */ | |||
| #ifdef FORCE_CORTEXA73 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "CORTEXA73" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA73 " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa73" | |||
| #define CORENAME "CORTEXA73" | |||
| #else | |||
| #endif | |||
| /* Forced-target description for FALKOR (ARM64, "arm64" kernel subdirectory). | |||
|  * ARCHCONFIG is a single concatenated string of -D definitions: cache geometry, TLB | |||
|  * parameters, VFP/NEON feature macros and ARMV8. | |||
|  * NOTE(review): the cache/TLB values are identical to the ones used for CORTEXA72 in | |||
|  * this file -- confirm they are intended for this core rather than copied placeholders. */ | |||
| #ifdef FORCE_FALKOR | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "FALKOR" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DFALKOR " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "falkor" | |||
| #define CORENAME "FALKOR" | |||
| #else | |||
| #endif | |||
| @@ -958,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DTHUNDERX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx" | |||
| #define CORENAME "THUNDERX" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_THUNDERX2T99 | |||
| #define ARMV8 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "THUNDERX2T99" | |||
| @@ -975,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx2t99" | |||
| #define CORENAME "THUNDERX2T99" | |||
| #else | |||
| @@ -1003,6 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef FORCE | |||
| #ifdef USER_TARGET | |||
| #error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" | |||
| #endif | |||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
| defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | |||
| #ifndef POWER | |||
| @@ -1074,7 +1143,7 @@ static int get_num_cores(void) { | |||
| #ifdef OS_WINDOWS | |||
| SYSTEM_INFO sysinfo; | |||
| #elif defined(__FreeBSD__) || defined(__APPLE__) | |||
| #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| int m[2], count; | |||
| size_t len; | |||
| #endif | |||
| @@ -1088,7 +1157,7 @@ static int get_num_cores(void) { | |||
| GetSystemInfo(&sysinfo); | |||
| return sysinfo.dwNumberOfProcessors; | |||
| #elif defined(__FreeBSD__) || defined(__APPLE__) | |||
| #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| m[0] = CTL_HW; | |||
| m[1] = HW_NCPU; | |||
| len = sizeof(int); | |||
| @@ -1116,7 +1185,7 @@ int main(int argc, char *argv[]){ | |||
| #ifdef FORCE | |||
| printf("CORE=%s\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||
| printf("CORE=%s\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -1181,9 +1250,7 @@ int main(int argc, char *argv[]){ | |||
| #elif NO_PARALLEL_MAKE==1 | |||
| printf("MAKE += -j 1\n"); | |||
| #else | |||
| #ifndef OS_WINDOWS | |||
| printf("MAKE += -j %d\n", get_num_cores()); | |||
| #endif | |||
| #endif | |||
| break; | |||
| @@ -1224,7 +1291,7 @@ int main(int argc, char *argv[]){ | |||
| #ifdef FORCE | |||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ | |||
| idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) | |||
| CSBLAS1OBJS = \ | |||
| cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||
| @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||
| @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ | |||
| cblas_dgeadd.$(SUFFIX) | |||
| CCBLAS1OBJS = \ | |||
| cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| cblas_ccopy.$(SUFFIX) \ | |||
| cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c | |||
| cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| # CBLAS i?amin objects (single, double, complex, double-complex), normal and profiled | |||
| # ($(PSUFFIX)) variants. All four are compiled from imax.c with -DCBLAS -DUSE_ABS -DUSE_MIN; | |||
| # the only difference from the cblas_i?amax rules is -DUSE_MIN in place of -UUSE_MIN. | |||
| cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| @@ -40,11 +40,11 @@ | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #endif | |||
| #if defined(Z13) | |||
| #define MULTI_THREAD_MINIMAL 200000 | |||
| #else | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #endif | |||
| #ifndef CBLAS | |||
| @@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| if (alpha == ZERO) return; | |||
| if (incx == 0 && incy == 0) { | |||
| *y += n * alpha *(*x); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -83,17 +88,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| if (incy < 0) y -= (n - 1) * incy; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| nthreads = 1; | |||
| // | |||
| //Temporarily work-around the low performance issue with small input size & | |||
| //multithreads. | |||
| if (n <= MULTI_THREAD_MINIMAL) | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans) lenx = m; | |||
| if (trans) leny = n; | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -44,6 +44,7 @@ | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMM " | |||
| #elif defined(DOUBLE) | |||
| @@ -52,6 +53,7 @@ | |||
| #define ERROR_NAME "SGEMM " | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| #ifndef GEMM3M | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMM " | |||
| @@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB, | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| XFLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int nthreads_max; | |||
| int nthreads_avail; | |||
| double MNK; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -273,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| PRINT_DEBUG_CNAME; | |||
| /* Fast path, compiled only for real single precision when USE_SGEMM_KERNEL_DIRECT is | |||
|  * defined: for row-major, untransposed A and B with alpha == 1 and beta == 0, hand the | |||
|  * operands straight to sgemm_kernel_direct() and return, skipping the rest of this | |||
|  * function, provided sgemm_kernel_direct_performant(m,n,k) accepts the problem size. */ | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { | |||
| sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| #endif | |||
| #ifndef COMPLEX | |||
| /* Real (non-COMPLEX) case: alpha and beta are scalars, so store their addresses. */ | |||
| args.alpha = (void *)&alpha; | |||
| args.beta = (void *)&beta; | |||
| @@ -411,25 +417,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| mode |= (transa << BLAS_TRANSA_SHIFT); | |||
| mode |= (transb << BLAS_TRANSB_SHIFT); | |||
| nthreads_max = num_cpu_avail(3); | |||
| nthreads_avail = nthreads_max; | |||
| #ifndef COMPLEX | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| #else | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| #endif | |||
| args.common = NULL; | |||
| if ( nthreads_max > nthreads_avail ) | |||
| args.nthreads = nthreads_avail; | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = nthreads_max; | |||
| args.nthreads = num_cpu_avail(3); | |||
| args.common = NULL; | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans) lenx = m; | |||
| if (trans) leny = n; | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * | |||
| blas_level1_thread(mode, n, k1, k2, dummyalpha, | |||
| a, lda, NULL, 0, ipiv, incx, | |||
| laswp[flag], nthreads); | |||
| (int(*)())laswp[flag], nthreads); | |||
| } | |||
| #endif | |||
| @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); | |||
| blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); | |||
| } | |||
| #endif | |||
| @@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| long double s; | |||
| long double r, roe, z; | |||
| long double ada = fabs(da); | |||
| long double adb = fabs(db); | |||
| long double ada = fabsl(da); | |||
| long double adb = fabsl(db); | |||
| long double scale = ada + adb; | |||
| #ifndef CBLAS | |||
| @@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; | |||
| if (*dd2 == ZERO || dy1 == ZERO) | |||
| { | |||
| dflag = -TWO; | |||
| dparam[0] = dflag; | |||
| return; | |||
| } | |||
| if(*dd1 < ZERO) | |||
| { | |||
| dflag = -ONE; | |||
| @@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| *dd2 = ZERO; | |||
| *dx1 = ZERO; | |||
| } | |||
| else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO) | |||
| { | |||
| dflag = ONE; | |||
| dh12 = 1; | |||
| dh21 = -1; | |||
| *dx1 = dy1; | |||
| dtemp = *dd1; | |||
| *dd1 = *dd2; | |||
| *dd2 = dtemp; | |||
| } | |||
| else | |||
| { | |||
| dp2 = *dd2 * dy1; | |||
| @@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| dq1 = dp1 * *dx1; | |||
| if(ABS(dq1) > ABS(dq2)) | |||
| { | |||
| dflag = ZERO; | |||
| dh11 = ONE; | |||
| dh22 = ONE; | |||
| dh21 = - dy1 / *dx1; | |||
| dh12 = dp2 / dp1; | |||
| @@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } else { | |||
| dflag = -ONE; | |||
| dh11 = ZERO; | |||
| dh12 = ZERO; | |||
| dh21 = ZERO; | |||
| dh22 = ZERO; | |||
| *dd1 = ZERO; | |||
| *dd2 = ZERO; | |||
| *dx1 = ZERO; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| @@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| } | |||
| else | |||
| { | |||
| dflag = ONE; | |||
| dflag = ONE; | |||
| dh21 = -ONE; | |||
| dh12 = ONE; | |||
| dh11 = dp1 / dp2; | |||
| dh22 = *dx1 / dy1; | |||
| @@ -134,74 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| } | |||
| if(*dd1 != ZERO) | |||
| while ( *dd1 <= RGAMSQ && *dd1 != ZERO) | |||
| { | |||
| while( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) ) | |||
| { | |||
| if(dflag == ZERO) | |||
| { | |||
| dh11 = ONE; | |||
| dh22 = ONE; | |||
| dflag = -ONE; | |||
| } | |||
| else | |||
| { | |||
| if(dflag == ONE) | |||
| { | |||
| dh21 = -ONE; | |||
| dh12 = ONE; | |||
| dflag = -ONE; | |||
| } | |||
| } | |||
| if( *dd1 <= RGAMSQ ) | |||
| { | |||
| *dd1 = *dd1 * (GAM * GAM); | |||
| *dx1 = *dx1 / GAM; | |||
| dh11 = dh11 / GAM; | |||
| dh12 = dh12 / GAM; | |||
| } | |||
| else | |||
| { | |||
| *dd1 = *dd1 / (GAM * GAM); | |||
| *dx1 = *dx1 * GAM; | |||
| dh11 = dh11 * GAM; | |||
| dh12 = dh12 * GAM; | |||
| } | |||
| } | |||
| dflag = -ONE; | |||
| *dd1 = *dd1 * (GAM * GAM); | |||
| *dx1 = *dx1 / GAM; | |||
| dh11 = dh11 / GAM; | |||
| dh12 = dh12 / GAM; | |||
| } | |||
| while (ABS(*dd1) > GAMSQ) { | |||
| dflag = -ONE; | |||
| *dd1 = *dd1 / (GAM * GAM); | |||
| *dx1 = *dx1 * GAM; | |||
| dh11 = dh11 * GAM; | |||
| dh12 = dh12 * GAM; | |||
| } | |||
| if(*dd2 != ZERO) | |||
| { | |||
| while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) ) | |||
| { | |||
| if(dflag == ZERO) | |||
| { | |||
| dh11 = ONE; | |||
| dh22 = ONE; | |||
| dflag = -ONE; | |||
| } | |||
| else | |||
| { | |||
| if(dflag == ONE) | |||
| { | |||
| dh21 = -ONE; | |||
| dh12 = ONE; | |||
| dflag = -ONE; | |||
| } | |||
| } | |||
| if( ABS(*dd2) <= RGAMSQ ) | |||
| { | |||
| *dd2 = *dd2 * (GAM * GAM); | |||
| dh21 = dh21 / GAM; | |||
| dh22 = dh22 / GAM; | |||
| } | |||
| else | |||
| { | |||
| *dd2 = *dd2 / (GAM * GAM); | |||
| dh21 = dh21 * GAM; | |||
| dh22 = dh22 * GAM; | |||
| } | |||
| } | |||
| while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) { | |||
| dflag = -ONE; | |||
| *dd2 = *dd2 * (GAM * GAM); | |||
| dh21 = dh21 / GAM; | |||
| dh22 = dh22 / GAM; | |||
| } | |||
| while (ABS(*dd2) > GAMSQ) { | |||
| dflag = -ONE; | |||
| *dd2 = *dd2 / (GAM * GAM); | |||
| dh21 = dh21 * GAM; | |||
| dh22 = dh22 * GAM; | |||
| } | |||
| } | |||
| @@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| if (n <= 1048576 ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -42,7 +42,7 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| @@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| if (n == 0) return; | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) return; | |||
| @@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order, | |||
| mode |= (trans << BLAS_TRANSA_SHIFT); | |||
| mode |= (side << BLAS_RSIDE_SHIFT); | |||
| args.nthreads = num_cpu_avail(3); | |||
| if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(3); | |||
| if (args.nthreads == 1) { | |||
| @@ -41,7 +41,11 @@ | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(Z13) | |||
| #define MULTI_THREAD_MINIMAL 200000 | |||
| #else | |||
| #define MULTI_THREAD_MINIMAL 10000 | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||
| @@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| #endif | |||
| #ifndef CBLAS | |||
| PRINT_DEBUG_CNAME; | |||
| PRINT_DEBUG_NAME; | |||
| #else | |||
| PRINT_DEBUG_CNAME; | |||
| #endif | |||
| @@ -78,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| if (incx == 0 && incy == 0) { | |||
| *y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) ); | |||
| *(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) ); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -86,12 +96,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| // | |||
| //Temporarily work-around the low performance issue with small input size & | |||
| //multithreads. | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans & 1) lenx = m; | |||
| if (trans & 1) leny = n; | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | |||
| @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (trans & 1) lenx = m; | |||
| if (trans & 1) leny = n; | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) return; | |||
| @@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -43,6 +43,10 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| // this is smallest dimension N of square input a to permit threading | |||
| // see graph in issue #1820 for explanation | |||
| #define MULTI_THREAD_MINIMAL 362 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XHEMV " | |||
| #elif defined(DOUBLE) | |||
| @@ -181,7 +185,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(2); | |||
| if (n<MULTI_THREAD_MINIMAL) { | |||
| nthreads = 1 ; | |||
| } else { | |||
| nthreads = num_cpu_avail(2); | |||
| }; | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| long double db_i = *(DB + 1); | |||
| long double r; | |||
| long double ada = fabs(da_r) + fabs(da_i); | |||
| long double ada = fabsl(da_r) + fabsl(da_i); | |||
| PRINT_DEBUG_NAME; | |||
| @@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * | |||
| if (n == 0) return; | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); | |||
| if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0); | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| FUNCTION_PROFILE_START(); | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| if ( n <= 1048576 ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -42,6 +42,14 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| // benefits. Keep the multi-threading code for the record. | |||
| #undef SMP | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||
| @@ -79,12 +87,12 @@ FLOAT *y = (FLOAT*)vy; | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(1); | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| if (incx == 0 || incy == 0) | |||
| if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } else | |||
| nthreads = 1; | |||
| /* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||
| nthreads = 1; | |||
| if(nthreads > 1) { | |||
| buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
| } | |||
| @@ -121,14 +121,17 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") | |||
| if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") | |||
| set(USE_TRMM true) | |||
| endif () | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| foreach (float_type SINGLE DOUBLE) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| endforeach() | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_char}GEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | |||
| endif () | |||
| @@ -5,8 +5,43 @@ endif | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| AVX2OPT = | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| AVX2OPT = -mavx2 | |||
| endif | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| # Any clang posing as gcc 4.2 should be new enough (3.4 or later) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) | |||
| ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11) | |||
| AVX2OPT = -mavx2 | |||
| endif | |||
| endif | |||
| ifdef NO_AVX2 | |||
| AVX2OPT= | |||
| endif | |||
| ifdef TARGET_CORE | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| else ifeq ($(TARGET_CORE), HASWELL) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| BUILD_KERNEL = 1 | |||
| KDIR = | |||
| TSUFFIX = _$(TARGET_CORE) | |||
| @@ -88,7 +123,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) | |||
| $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) | |||
| setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h | |||
| ifeq ($(USE_GEMM3M), 1) | |||
| $(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@ | |||
| else | |||
| $(CC) -c $(CFLAGS) $< -o $@ | |||
| endif | |||
| setparam$(TSUFFIX).c : setparam-ref.c | |||
| sed 's/TS/$(TSUFFIX)/g' $< > $(@F) | |||
| @@ -29,9 +29,11 @@ USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), HASWELL) | |||
| ifeq ($(ARCH), x86_64) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), ZEN) | |||
| @@ -42,7 +44,7 @@ ifeq ($(CORE), POWER8) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), Z13) | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -49,6 +49,7 @@ SDOTKERNEL = ../arm/dot.c | |||
| DDOTKERNEL = ../arm/dot.c | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| @@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d5 } | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| fldmiad X!, { d6 - d7 } | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vabs.f64 d6, d6 | |||
| vadd.f64 d1 , d1, d5 | |||
| vabs.f64 d7, d7 | |||
| @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmiad X!, { d4 } | |||
| vldmia.f64 X!, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| @@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S4 | |||
| fldmiad X, { d4 } | |||
| vldmia.f64 X, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 } | |||
| vldmia.f64 X, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 } | |||
| vldmia.f64 X, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 } | |||
| vldmia.f64 X, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| @@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmiad X, { d4 } | |||
| vldmia.f64 X, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| @@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| fldmias X!, { s4 - s5 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| fldmias X!, { s6 - s7 } | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vabs.f32 s6, s6 | |||
| vadd.f32 s1 , s1, s5 | |||
| vabs.f32 s7, s7 | |||
| @@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmias X!, { s4 } | |||
| vldmia.f32 X!, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| @@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S4 | |||
| fldmias X, { s4 } | |||
| vldmia.f32 X, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 } | |||
| vldmia.f32 X, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 } | |||
| vldmia.f32 X, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 } | |||
| vldmia.f32 X, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmias X, { s4 } | |||
| vldmia.f32 X, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| @@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d5 } | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| fldmiad X!, { d6 - d7 } | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vabs.f64 d6, d6 | |||
| vadd.f64 d1 , d1, d5 | |||
| vabs.f64 d7, d7 | |||
| @@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vadd.f64 d1 , d1, d7 | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d5 } | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| fldmiad X!, { d6 - d7 } | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vabs.f64 d6, d6 | |||
| vadd.f64 d1 , d1, d5 | |||
| vabs.f64 d7, d7 | |||
| @@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmiad X!, { d4 } | |||
| vldmia.f64 X!, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| fldmiad X!, { d4 } | |||
| vldmia.f64 X!, { d4 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| @@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S4 | |||
| fldmiad X, { d4 -d5 } | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 -d5 } | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 -d5 } | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| fldmiad X, { d4 -d5 } | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmiad X, { d4 -d5 } | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vabs.f64 d4, d4 | |||
| vadd.f64 d0 , d0, d4 | |||
| vabs.f64 d5, d5 | |||
| @@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmias X!, { s4 - s5 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| fldmias X!, { s6 - s7 } | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vabs.f32 s6, s6 | |||
| vadd.f32 s1 , s1, s5 | |||
| vabs.f32 s7, s7 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| fldmias X!, { s4 - s5 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| fldmias X!, { s6 - s7 } | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vabs.f32 s6, s6 | |||
| vadd.f32 s1 , s1, s5 | |||
| vabs.f32 s7, s7 | |||
| @@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmias X!, { s4 } | |||
| vldmia.f32 X!, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| fldmias X!, { s4 } | |||
| vldmia.f32 X!, { s4 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| @@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S4 | |||
| fldmias X, { s4 -s5 } | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 -s5 } | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 -s5 } | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| fldmias X, { s4 -s5 } | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmias X, { s4 -s5 } | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vabs.f32 s4, s4 | |||
| vadd.f32 s0 , s0, s4 | |||
| vabs.f32 s5, s5 | |||
| @@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d7 } | |||
| vldmia.f64 X!, { d4 - d7 } | |||
| pld [ Y, #X_PRE ] | |||
| fldmiad Y , { d8 - d11 } | |||
| vldmia.f64 Y , { d8 - d11 } | |||
| fmacd d8 , d0, d4 | |||
| fstmiad Y!, { d8 } | |||
| vstmia.f64 Y!, { d8 } | |||
| fmacd d9 , d0, d5 | |||
| fstmiad Y!, { d9 } | |||
| vstmia.f64 Y!, { d9 } | |||
| fmacd d10, d0, d6 | |||
| fstmiad Y!, { d10 } | |||
| vstmia.f64 Y!, { d10 } | |||
| fmacd d11, d0, d7 | |||
| fstmiad Y!, { d11 } | |||
| vstmia.f64 Y!, { d11 } | |||
| .endm | |||
| @@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmiad X!, { d4 } | |||
| fldmiad Y , { d8 } | |||
| vldmia.f64 X!, { d4 } | |||
| vldmia.f64 Y , { d8 } | |||
| fmacd d8 , d0, d4 | |||
| fstmiad Y!, { d8 } | |||
| vstmia.f64 Y!, { d8 } | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| fldmiad X , { d4 } | |||
| fldmiad Y , { d8 } | |||
| vldmia.f64 X , { d4 } | |||
| vldmia.f64 Y , { d8 } | |||
| fmacd d8 , d0, d4 | |||
| fstmiad Y , { d8 } | |||
| vstmia.f64 Y , { d8 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| fldmias X!, { s4 - s7 } | |||
| fldmias Y , { s8 - s11 } | |||
| vldmia.f32 X!, { s4 - s7 } | |||
| vldmia.f32 Y , { s8 - s11 } | |||
| fmacs s8 , s0, s4 | |||
| fstmias Y!, { s8 } | |||
| vstmia.f32 Y!, { s8 } | |||
| fmacs s9 , s0, s5 | |||
| fstmias Y!, { s9 } | |||
| vstmia.f32 Y!, { s9 } | |||
| fmacs s10, s0, s6 | |||
| fstmias Y!, { s10 } | |||
| vstmia.f32 Y!, { s10 } | |||
| fmacs s11, s0, s7 | |||
| fstmias Y!, { s11 } | |||
| vstmia.f32 Y!, { s11 } | |||
| .endm | |||
| @@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmias X!, { s4 } | |||
| fldmias Y , { s8 } | |||
| vldmia.f32 X!, { s4 } | |||
| vldmia.f32 Y , { s8 } | |||
| fmacs s8 , s0, s4 | |||
| fstmias Y!, { s8 } | |||
| vstmia.f32 Y!, { s8 } | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| fldmias X , { s4 } | |||
| fldmias Y , { s8 } | |||
| vldmia.f32 X , { s4 } | |||
| vldmia.f32 Y , { s8 } | |||
| fmacs s8 , s0, s4 | |||
| fstmias Y , { s8 } | |||
| vstmia.f32 Y , { s8 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d7 } | |||
| vldmia.f64 X!, { d4 - d7 } | |||
| pld [ Y, #X_PRE ] | |||
| fldmiad Y , { d8 - d11 } | |||
| vldmia.f64 Y , { d8 - d11 } | |||
| FMAC_R1 d8 , d0, d4 | |||
| FMAC_R2 d8 , d1, d5 | |||
| FMAC_I1 d9 , d0, d5 | |||
| FMAC_I2 d9 , d1, d4 | |||
| fstmiad Y!, { d8 } | |||
| fstmiad Y!, { d9 } | |||
| vstmia.f64 Y!, { d8 } | |||
| vstmia.f64 Y!, { d9 } | |||
| FMAC_R1 d10, d0, d6 | |||
| FMAC_R2 d10, d1, d7 | |||
| FMAC_I1 d11, d0, d7 | |||
| FMAC_I2 d11, d1, d6 | |||
| fstmiad Y!, { d10 } | |||
| fstmiad Y!, { d11 } | |||
| vstmia.f64 Y!, { d10 } | |||
| vstmia.f64 Y!, { d11 } | |||
| pld [ X, #X_PRE ] | |||
| fldmiad X!, { d4 - d7 } | |||
| vldmia.f64 X!, { d4 - d7 } | |||
| pld [ Y, #X_PRE ] | |||
| fldmiad Y , { d8 - d11 } | |||
| vldmia.f64 Y , { d8 - d11 } | |||
| FMAC_R1 d8 , d0, d4 | |||
| FMAC_R2 d8 , d1, d5 | |||
| FMAC_I1 d9 , d0, d5 | |||
| FMAC_I2 d9 , d1, d4 | |||
| fstmiad Y!, { d8 } | |||
| fstmiad Y!, { d9 } | |||
| vstmia.f64 Y!, { d8 } | |||
| vstmia.f64 Y!, { d9 } | |||
| FMAC_R1 d10, d0, d6 | |||
| FMAC_R2 d10, d1, d7 | |||
| FMAC_I1 d11, d0, d7 | |||
| FMAC_I2 d11, d1, d6 | |||
| fstmiad Y!, { d10 } | |||
| fstmiad Y!, { d11 } | |||
| vstmia.f64 Y!, { d10 } | |||
| vstmia.f64 Y!, { d11 } | |||
| @@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmiad X!, { d4 - d5 } | |||
| fldmiad Y , { d8 - d9 } | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vldmia.f64 Y , { d8 - d9 } | |||
| FMAC_R1 d8 , d0, d4 | |||
| FMAC_R2 d8 , d1, d5 | |||
| FMAC_I1 d9 , d0, d5 | |||
| FMAC_I2 d9 , d1, d4 | |||
| fstmiad Y!, { d8 } | |||
| fstmiad Y!, { d9 } | |||
| vstmia.f64 Y!, { d8 } | |||
| vstmia.f64 Y!, { d9 } | |||
| @@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmiad X , { d4 - d5 } | |||
| fldmiad Y , { d8 - d9 } | |||
| vldmia.f64 X , { d4 - d5 } | |||
| vldmia.f64 Y , { d8 - d9 } | |||
| FMAC_R1 d8 , d0, d4 | |||
| FMAC_R2 d8 , d1, d5 | |||
| FMAC_I1 d9 , d0, d5 | |||
| FMAC_I2 d9 , d1, d4 | |||
| fstmiad Y , { d8 - d9 } | |||
| vstmia.f64 Y , { d8 - d9 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmias X!, { s4 - s7 } | |||
| vldmia.f32 X!, { s4 - s7 } | |||
| pld [ Y, #X_PRE ] | |||
| fldmias Y , { s8 - s11 } | |||
| vldmia.f32 Y , { s8 - s11 } | |||
| FMAC_R1 s8 , s0, s4 | |||
| FMAC_R2 s8 , s1, s5 | |||
| FMAC_I1 s9 , s0, s5 | |||
| FMAC_I2 s9 , s1, s4 | |||
| fstmias Y!, { s8 } | |||
| fstmias Y!, { s9 } | |||
| vstmia.f32 Y!, { s8 } | |||
| vstmia.f32 Y!, { s9 } | |||
| FMAC_R1 s10, s0, s6 | |||
| FMAC_R2 s10, s1, s7 | |||
| FMAC_I1 s11, s0, s7 | |||
| FMAC_I2 s11, s1, s6 | |||
| fstmias Y!, { s10 } | |||
| fstmias Y!, { s11 } | |||
| vstmia.f32 Y!, { s10 } | |||
| vstmia.f32 Y!, { s11 } | |||
| fldmias X!, { s4 - s7 } | |||
| fldmias Y , { s8 - s11 } | |||
| vldmia.f32 X!, { s4 - s7 } | |||
| vldmia.f32 Y , { s8 - s11 } | |||
| FMAC_R1 s8 , s0, s4 | |||
| FMAC_R2 s8 , s1, s5 | |||
| FMAC_I1 s9 , s0, s5 | |||
| FMAC_I2 s9 , s1, s4 | |||
| fstmias Y!, { s8 } | |||
| fstmias Y!, { s9 } | |||
| vstmia.f32 Y!, { s8 } | |||
| vstmia.f32 Y!, { s9 } | |||
| FMAC_R1 s10, s0, s6 | |||
| FMAC_R2 s10, s1, s7 | |||
| FMAC_I1 s11, s0, s7 | |||
| FMAC_I2 s11, s1, s6 | |||
| fstmias Y!, { s10 } | |||
| fstmias Y!, { s11 } | |||
| vstmia.f32 Y!, { s10 } | |||
| vstmia.f32 Y!, { s11 } | |||
| @@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmias X!, { s4 - s5 } | |||
| fldmias Y , { s8 - s9 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vldmia.f32 Y , { s8 - s9 } | |||
| FMAC_R1 s8 , s0, s4 | |||
| FMAC_R2 s8 , s1, s5 | |||
| FMAC_I1 s9 , s0, s5 | |||
| FMAC_I2 s9 , s1, s4 | |||
| fstmias Y!, { s8 } | |||
| fstmias Y!, { s9 } | |||
| vstmia.f32 Y!, { s8 } | |||
| vstmia.f32 Y!, { s9 } | |||
| @@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmias X , { s4 - s5 } | |||
| fldmias Y , { s8 - s9 } | |||
| vldmia.f32 X , { s4 - s5 } | |||
| vldmia.f32 Y , { s8 - s9 } | |||
| FMAC_R1 s8 , s0, s4 | |||
| FMAC_R2 s8 , s1, s5 | |||
| FMAC_I1 s9 , s0, s5 | |||
| FMAC_I2 s9 , s1, s4 | |||
| fstmias Y , { s8 - s9 } | |||
| vstmia.f32 Y , { s8 - s9 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble axpy_kernel_L999 | |||
| /* | |||
| cmp INC_X, #0 | |||
| beq axpy_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq axpy_kernel_L999 | |||
| */ | |||
| cmp INC_X, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro COPY_F4 | |||
| pld [ X, #X_PRE ] | |||
| fldmias X!, { s0 - s7 } | |||
| fstmias Y!, { s0 - s7 } | |||
| vldmia.f32 X!, { s0 - s7 } | |||
| vstmia.f32 Y!, { s0 - s7 } | |||
| .endm | |||
| .macro COPY_F1 | |||
| fldmias X!, { s0 - s1 } | |||
| fstmias Y!, { s0 - s1 } | |||
| vldmia.f32 X!, { s0 - s1 } | |||
| vstmia.f32 Y!, { s0 - s1 } | |||
| .endm | |||
| @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro COPY_S4 | |||
| nop | |||
| fldmias X, { s0 - s1 } | |||
| fstmias Y, { s0 - s1 } | |||
| vldmia.f32 X, { s0 - s1 } | |||
| vstmia.f32 Y, { s0 - s1 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s2 - s3 } | |||
| fstmias Y, { s2 - s3 } | |||
| vldmia.f32 X, { s2 - s3 } | |||
| vstmia.f32 Y, { s2 - s3 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s0 - s1 } | |||
| fstmias Y, { s0 - s1 } | |||
| vldmia.f32 X, { s0 - s1 } | |||
| vstmia.f32 Y, { s0 - s1 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s2 - s3 } | |||
| fstmias Y, { s2 - s3 } | |||
| vldmia.f32 X, { s2 - s3 } | |||
| vstmia.f32 Y, { s2 - s3 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro COPY_S1 | |||
| fldmias X, { s0 - s1 } | |||
| fstmias Y, { s0 - s1 } | |||
| vldmia.f32 X, { s0 - s1 } | |||
| vstmia.f32 Y, { s0 - s1 } | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| @@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| pld [ X, #X_PRE ] | |||
| pld [ Y, #X_PRE ] | |||
| fldmias X!, { s4 - s5 } | |||
| fldmias Y!, { s8 - s9 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vldmia.f32 Y!, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fldmias X!, { s6 - s7 } | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| fmacs s2 , s5, s9 | |||
| fmacs s3 , s5, s8 | |||
| fldmias Y!, { s10 - s11 } | |||
| vldmia.f32 Y!, { s10 - s11 } | |||
| fmacs s0 , s6, s10 | |||
| fmacs s1 , s6, s11 | |||
| fmacs s2 , s7, s11 | |||
| fmacs s3 , s7, s10 | |||
| fldmias X!, { s4 - s5 } | |||
| fldmias Y!, { s8 - s9 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vldmia.f32 Y!, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fldmias X!, { s6 - s7 } | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| fmacs s2 , s5, s9 | |||
| fmacs s3 , s5, s8 | |||
| fldmias Y!, { s10 - s11 } | |||
| vldmia.f32 Y!, { s10 - s11 } | |||
| fmacs s0 , s6, s10 | |||
| fmacs s1 , s6, s11 | |||
| fmacs s2 , s7, s11 | |||
| @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F1 | |||
| fldmias X!, { s4 - s5 } | |||
| fldmias Y!, { s8 - s9 } | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vldmia.f32 Y!, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| nop | |||
| fldmias X, { s4 - s5 } | |||
| fldmias Y, { s8 - s9 } | |||
| vldmia.f32 X, { s4 - s5 } | |||
| vldmia.f32 Y, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s4 - s5 } | |||
| fldmias Y, { s8 - s9 } | |||
| vldmia.f32 X, { s4 - s5 } | |||
| vldmia.f32 Y, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s4 - s5 } | |||
| fldmias Y, { s8 - s9 } | |||
| vldmia.f32 X, { s4 - s5 } | |||
| vldmia.f32 Y, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add X, X, INC_X | |||
| add Y, Y, INC_Y | |||
| fldmias X, { s4 - s5 } | |||
| fldmias Y, { s8 - s9 } | |||
| vldmia.f32 X, { s4 - s5 } | |||
| vldmia.f32 Y, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_S1 | |||
| fldmias X, { s4 - s5 } | |||
| fldmias Y, { s8 - s9 } | |||
| vldmia.f32 X, { s4 - s5 } | |||
| vldmia.f32 Y, { s8 - s9 } | |||
| fmacs s0 , s4, s8 | |||
| fmacs s1 , s4, s9 | |||
| fmacs s2 , s5, s9 | |||
| @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble cdot_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq cdot_kernel_L999 | |||
| # cmp INC_X, #0 | |||
| # beq cdot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq cdot_kernel_L999 | |||
| # cmp INC_Y, #0 | |||
| # beq cdot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne cdot_kernel_S_BEGIN | |||
| @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_I | |||
| pld [ AO, #A_PRE ] | |||
| fldmias AO!, { s0 - s3 } | |||
| vldmia.f32 AO!, { s0 - s3 } | |||
| pld [ BO, #B_PRE ] | |||
| fldmias BO!, { s4 - s7 } | |||
| vldmia.f32 BO!, { s4 - s7 } | |||
| fmuls s8 , s0, s4 | |||
| @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_M1 | |||
| pld [ AO, #A_PRE ] | |||
| fldmias AO!, { s0 - s3 } | |||
| vldmia.f32 AO!, { s0 - s3 } | |||
| pld [ BO, #B_PRE ] | |||
| fldmias BO!, { s4 - s7 } | |||
| vldmia.f32 BO!, { s4 - s7 } | |||
| fmacs s8 , s0, s4 | |||
| fmacs s9 , s0, s5 | |||
| @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_M2 | |||
| fldmias AO!, { s0 - s3 } | |||
| fldmias BO!, { s4 - s7 } | |||
| vldmia.f32 AO!, { s0 - s3 } | |||
| vldmia.f32 BO!, { s4 - s7 } | |||
| fmacs s8 , s0, s4 | |||
| fmacs s9 , s0, s5 | |||
| @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_E | |||
| fldmias AO!, { s0 - s3 } | |||
| fldmias BO!, { s4 - s7 } | |||
| vldmia.f32 AO!, { s0 - s3 } | |||
| vldmia.f32 BO!, { s4 - s7 } | |||
| fmacs s8 , s0, s4 | |||
| fmacs s9 , s0, s5 | |||
| @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_SUB | |||
| fldmias AO!, { s0 - s3 } | |||
| fldmias BO!, { s4 - s7 } | |||
| vldmia.f32 AO!, { s0 - s3 } | |||
| vldmia.f32 BO!, { s4 - s7 } | |||
| fmacs s8 , s0, s4 | |||
| fmacs s9 , s0, s5 | |||
| @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s7 } | |||
| vldmia.f32 CO1, { s4 - s7 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| @@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s11 | |||
| FMAC_I2 s7 , s1 , s10 | |||
| fstmias CO1, { s4 - s7 } | |||
| vstmia.f32 CO1, { s4 - s7 } | |||
| fldmias CO2, { s4 - s7 } | |||
| vldmia.f32 CO2, { s4 - s7 } | |||
| FMAC_R1 s4 , s0 , s12 | |||
| FMAC_I1 s5 , s0 , s13 | |||
| @@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s15 | |||
| FMAC_I2 s7 , s1 , s14 | |||
| fstmias CO2, { s4 - s7 } | |||
| vstmia.f32 CO2, { s4 - s7 } | |||
| add CO1, CO1, #16 | |||
| @@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s5 } | |||
| vldmia.f32 CO1, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| FMAC_R2 s4 , s1 , s9 | |||
| FMAC_I2 s5 , s1 , s8 | |||
| fstmias CO1, { s4 - s5 } | |||
| vstmia.f32 CO1, { s4 - s5 } | |||
| fldmias CO2, { s4 - s5 } | |||
| vldmia.f32 CO2, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s12 | |||
| FMAC_I1 s5 , s0 , s13 | |||
| FMAC_R2 s4 , s1 , s13 | |||
| FMAC_I2 s5 , s1 , s12 | |||
| fstmias CO2, { s4 - s5 } | |||
| vstmia.f32 CO2, { s4 - s5 } | |||
| add CO1, CO1, #8 | |||
| @@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s7 } | |||
| vldmia.f32 CO1, { s4 - s7 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| @@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s11 | |||
| FMAC_I2 s7 , s1 , s10 | |||
| fstmias CO1, { s4 - s7 } | |||
| vstmia.f32 CO1, { s4 - s7 } | |||
| add CO1, CO1, #16 | |||
| @@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s5 } | |||
| vldmia.f32 CO1, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| FMAC_R2 s4 , s1 , s9 | |||
| FMAC_I2 s5 , s1 , s8 | |||
| fstmias CO1, { s4 - s5 } | |||
| vstmia.f32 CO1, { s4 - s5 } | |||
| add CO1, CO1, #8 | |||
| @@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_I | |||
| pld [ AO , #A_PRE ] | |||
| pld [ BO , #B_PRE ] | |||
| fldmias AO!, { s0 - s1 } | |||
| fldmias BO!, { s8 - s9 } | |||
| vldmia.f32 AO!, { s0 - s1 } | |||
| vldmia.f32 BO!, { s8 - s9 } | |||
| fmuls s16 , s0, s8 | |||
| fmuls s24 , s1, s9 | |||
| fldmias AO!, { s2 - s3 } | |||
| vldmia.f32 AO!, { s2 - s3 } | |||
| fmuls s17 , s0, s9 | |||
| fmuls s25 , s1, s8 | |||
| fldmias BO!, { s10 - s11 } | |||
| vldmia.f32 BO!, { s10 - s11 } | |||
| fmuls s18 , s2, s8 | |||
| fmuls s26 , s3, s9 | |||
| fldmias AO!, { s4 - s5 } | |||
| vldmia.f32 AO!, { s4 - s5 } | |||
| fmuls s19 , s2, s9 | |||
| fmuls s27 , s3, s8 | |||
| fldmias BO!, { s12 - s13 } | |||
| vldmia.f32 BO!, { s12 - s13 } | |||
| fmuls s20 , s0, s10 | |||
| fmuls s28 , s1, s11 | |||
| fldmias AO!, { s6 - s7 } | |||
| vldmia.f32 AO!, { s6 - s7 } | |||
| fmuls s21 , s0, s11 | |||
| fmuls s29 , s1, s10 | |||
| fldmias BO!, { s14 - s15 } | |||
| vldmia.f32 BO!, { s14 - s15 } | |||
| fmuls s22 , s2, s10 | |||
| fmuls s30 , s3, s11 | |||
| fmuls s23 , s2, s11 | |||
| @@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_M1 | |||
| fmacs s16 , s0, s8 | |||
| fldmias AO!, { s4 - s5 } | |||
| vldmia.f32 AO!, { s4 - s5 } | |||
| fmacs s24 , s1, s9 | |||
| fmacs s17 , s0, s9 | |||
| fldmias BO!, { s12 - s13 } | |||
| vldmia.f32 BO!, { s12 - s13 } | |||
| fmacs s25 , s1, s8 | |||
| fmacs s18 , s2, s8 | |||
| fldmias AO!, { s6 - s7 } | |||
| vldmia.f32 AO!, { s6 - s7 } | |||
| fmacs s26 , s3, s9 | |||
| fmacs s19 , s2, s9 | |||
| fldmias BO!, { s14 - s15 } | |||
| vldmia.f32 BO!, { s14 - s15 } | |||
| fmacs s27 , s3, s8 | |||
| fmacs s20 , s0, s10 | |||
| @@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| pld [ BO , #B_PRE ] | |||
| fmacs s24 , s5, s13 | |||
| fmacs s17 , s4, s13 | |||
| fldmias AO!, { s0 - s1 } | |||
| vldmia.f32 AO!, { s0 - s1 } | |||
| fmacs s25 , s5, s12 | |||
| fmacs s18 , s6, s12 | |||
| fmacs s26 , s7, s13 | |||
| fldmias BO!, { s8 - s9 } | |||
| vldmia.f32 BO!, { s8 - s9 } | |||
| fmacs s19 , s6, s13 | |||
| fmacs s27 , s7, s12 | |||
| fldmias AO!, { s2 - s3 } | |||
| vldmia.f32 AO!, { s2 - s3 } | |||
| fmacs s20 , s4, s14 | |||
| fmacs s28 , s5, s15 | |||
| fldmias BO!, { s10 - s11 } | |||
| vldmia.f32 BO!, { s10 - s11 } | |||
| fmacs s21 , s4, s15 | |||
| fmacs s29 , s5, s14 | |||
| @@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_SUB | |||
| fldmias AO!, { s0 - s1 } | |||
| fldmias BO!, { s8 - s9 } | |||
| vldmia.f32 AO!, { s0 - s1 } | |||
| vldmia.f32 BO!, { s8 - s9 } | |||
| fmacs s16 , s0, s8 | |||
| fmacs s24 , s1, s9 | |||
| fldmias AO!, { s2 - s3 } | |||
| vldmia.f32 AO!, { s2 - s3 } | |||
| fmacs s17 , s0, s9 | |||
| fmacs s25 , s1, s8 | |||
| fldmias BO!, { s10 - s11 } | |||
| vldmia.f32 BO!, { s10 - s11 } | |||
| fmacs s18 , s2, s8 | |||
| fmacs s26 , s3, s9 | |||
| fmacs s19 , s2, s9 | |||
| @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s7 } | |||
| fldmias CO2, { s8 - s11 } | |||
| vldmia.f32 CO1, { s4 - s7 } | |||
| vldmia.f32 CO2, { s8 - s11 } | |||
| FADD_R s16, s24 , s16 | |||
| FADD_I s17, s25 , s17 | |||
| @@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s10, s1 , s23 | |||
| FMAC_I2 s11, s1 , s22 | |||
| fstmias CO1, { s4 - s7 } | |||
| fstmias CO2, { s8 - s11 } | |||
| vstmia.f32 CO1, { s4 - s7 } | |||
| vstmia.f32 CO2, { s8 - s11 } | |||
| add CO1, CO1, #16 | |||
| @@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s5 } | |||
| fldmias CO2, { s8 - s9 } | |||
| vldmia.f32 CO1, { s4 - s5 } | |||
| vldmia.f32 CO2, { s8 - s9 } | |||
| FADD_R s16, s24 , s16 | |||
| FADD_I s17, s25 , s17 | |||
| @@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s8 , s1 , s21 | |||
| FMAC_I2 s9 , s1 , s20 | |||
| fstmias CO1, { s4 - s5 } | |||
| fstmias CO2, { s8 - s9 } | |||
| vstmia.f32 CO1, { s4 - s5 } | |||
| vstmia.f32 CO2, { s8 - s9 } | |||
| add CO1, CO1, #8 | |||
| @@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s7 } | |||
| vldmia.f32 CO1, { s4 - s7 } | |||
| FADD_R s16, s24 , s16 | |||
| FADD_I s17, s25 , s17 | |||
| @@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s19 | |||
| FMAC_I2 s7 , s1 , s18 | |||
| fstmias CO1, { s4 - s7 } | |||
| vstmia.f32 CO1, { s4 - s7 } | |||
| add CO1, CO1, #16 | |||
| @@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias CO1, { s4 - s5 } | |||
| vldmia.f32 CO1, { s4 - s5 } | |||
| FADD_R s16, s24 , s16 | |||
| FADD_I s17, s25 , s17 | |||
| @@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s4 , s1 , s17 | |||
| FMAC_I2 s5 , s1 , s16 | |||
| fstmias CO1, { s4 - s5 } | |||
| vstmia.f32 CO1, { s4 - s5 } | |||
| add CO1, CO1, #8 | |||
| @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s6 , [ AO2, #8 ] | |||
| flds s7 , [ AO2, #12 ] | |||
| fstmias BO!, { s0 - s7 } | |||
| vstmia.f32 BO!, { s0 - s7 } | |||
| add AO2, AO2, #16 | |||
| .endm | |||
| @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s3 , [ AO2, #4 ] | |||
| add AO1, AO1, #8 | |||
| fstmias BO!, { s0 - s3 } | |||
| vstmia.f32 BO!, { s0 - s3 } | |||
| add AO2, AO2, #8 | |||
| .endm | |||
| @@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s2 , [ AO1, #8 ] | |||
| flds s3 , [ AO1, #12 ] | |||
| fstmias BO!, { s0 - s3 } | |||
| vstmia.f32 BO!, { s0 - s3 } | |||
| add AO1, AO1, #16 | |||
| .endm | |||
| @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0 , [ AO1, #0 ] | |||
| flds s1 , [ AO1, #4 ] | |||
| fstmias BO!, { s0 - s1 } | |||
| vstmia.f32 BO!, { s0 - s1 } | |||
| add AO1, AO1, #8 | |||
| .endm | |||
| @@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **************************************************************************************/ | |||
| .macro COPY2x2 | |||
| fldmias AO1, { s0 - s3 } | |||
| vldmia.f32 AO1, { s0 - s3 } | |||
| add r3, AO1, LDA | |||
| fldmias r3, { s4 - s7 } | |||
| vldmia.f32 r3, { s4 - s7 } | |||
| fstmias BO1, { s0 - s7 } | |||
| vstmia.f32 BO1, { s0 - s7 } | |||
| add AO1, AO1, #16 | |||
| add BO1, BO1, M4 | |||
| @@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro COPY1x2 | |||
| fldmias AO1, { s0 -s1 } | |||
| vldmia.f32 AO1, { s0 -s1 } | |||
| add r3, AO1, LDA | |||
| fldmias r3, { s2 - s3 } | |||
| vldmia.f32 r3, { s2 - s3 } | |||
| fstmias BO2, { s0 - s3 } | |||
| vstmia.f32 BO2, { s0 - s3 } | |||
| add AO1, AO1, #8 | |||
| add BO2, BO2, #16 | |||
| @@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY2x1 | |||
| fldmias AO1, { s0 - s3 } | |||
| vldmia.f32 AO1, { s0 - s3 } | |||
| fstmias BO1, { s0 - s3 } | |||
| vstmia.f32 BO1, { s0 - s3 } | |||
| add AO1, AO1, #16 | |||
| add BO1, BO1, M4 | |||
| @@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro COPY1x1 | |||
| fldmias AO1, { s0 - s1 } | |||
| vldmia.f32 AO1, { s0 - s1 } | |||
| fstmias BO2, { s0 - s1 } | |||
| vstmia.f32 BO2, { s0 - s1 } | |||
| add AO1, AO1, #8 | |||
| add BO2, BO2, #8 | |||
| @@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias YO, { s4 - s7 } | |||
| vldmia.f32 YO, { s4 - s7 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| @@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s11 | |||
| FMAC_I2 s7 , s1 , s10 | |||
| fstmias YO!, { s4 - s7 } | |||
| vstmia.f32 YO!, { s4 - s7 } | |||
| fldmias YO, { s4 - s7 } | |||
| vldmia.f32 YO, { s4 - s7 } | |||
| FMAC_R1 s4 , s0 , s12 | |||
| FMAC_I1 s5 , s0 , s13 | |||
| @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FMAC_R2 s6 , s1 , s15 | |||
| FMAC_I2 s7 , s1 , s14 | |||
| fstmias YO!, { s4 - s7 } | |||
| vstmia.f32 YO!, { s4 - s7 } | |||
| .endm | |||
| @@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias YO, { s4 - s5 } | |||
| vldmia.f32 YO, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| FMAC_R2 s4 , s1 , s9 | |||
| FMAC_I2 s5 , s1 , s8 | |||
| fstmias YO, { s4 - s5 } | |||
| vstmia.f32 YO, { s4 - s5 } | |||
| add YO, YO, #8 | |||
| @@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias YO, { s4 - s5 } | |||
| vldmia.f32 YO, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| FMAC_R2 s4 , s1 , s9 | |||
| FMAC_I2 s5 , s1 , s8 | |||
| fstmias YO, { s4 - s5 } | |||
| vstmia.f32 YO, { s4 - s5 } | |||
| add YO, YO, INC_Y | |||
| fldmias YO, { s6 - s7 } | |||
| vldmia.f32 YO, { s6 - s7 } | |||
| FMAC_R1 s6 , s0 , s10 | |||
| FMAC_I1 s7 , s0 , s11 | |||
| FMAC_R2 s6 , s1 , s11 | |||
| FMAC_I2 s7 , s1 , s10 | |||
| fstmias YO, { s6 - s7 } | |||
| vstmia.f32 YO, { s6 - s7 } | |||
| add YO, YO, INC_Y | |||
| fldmias YO, { s4 - s5 } | |||
| vldmia.f32 YO, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s12 | |||
| FMAC_I1 s5 , s0 , s13 | |||
| FMAC_R2 s4 , s1 , s13 | |||
| FMAC_I2 s5 , s1 , s12 | |||
| fstmias YO, { s4 - s5 } | |||
| vstmia.f32 YO, { s4 - s5 } | |||
| add YO, YO, INC_Y | |||
| fldmias YO, { s6 - s7 } | |||
| vldmia.f32 YO, { s6 - s7 } | |||
| FMAC_R1 s6 , s0 , s14 | |||
| FMAC_I1 s7 , s0 , s15 | |||
| FMAC_R2 s6 , s1 , s15 | |||
| FMAC_I2 s7 , s1 , s14 | |||
| fstmias YO, { s6 - s7 } | |||
| vstmia.f32 YO, { s6 - s7 } | |||
| add YO, YO, INC_Y | |||
| @@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| flds s0, ALPHA_R | |||
| flds s1, ALPHA_I | |||
| fldmias YO, { s4 - s5 } | |||
| vldmia.f32 YO, { s4 - s5 } | |||
| FMAC_R1 s4 , s0 , s8 | |||
| FMAC_I1 s5 , s0 , s9 | |||
| FMAC_R2 s4 , s1 , s9 | |||
| FMAC_I2 s5 , s1 , s8 | |||
| fstmias YO, { s4 - s5 } | |||
| vstmia.f32 YO, { s4 - s5 } | |||
| add YO, YO, INC_Y | |||