Update from develop for 0.3.21 release (tags/v0.3.21)
| @@ -5,27 +5,20 @@ on: [push, pull_request] | |||
| jobs: | |||
| build: | |||
| runs-on: ${{ matrix.os }} | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| os: [ubuntu-latest, macos-latest] | |||
| fortran: [gfortran, flang] | |||
| build: [cmake, make] | |||
| exclude: | |||
| - os: macos-latest | |||
| fortran: flang | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v2 | |||
| - name: Compilation cache | |||
| uses: actions/cache@v2 | |||
| with: | |||
| path: ~/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ${{ runner.os }}-ccache-${{ github.sha }} | |||
| # Restore any ccache cache entry, if none for | |||
| # ${{ runner.os }}-ccache-${{ github.sha }} exists | |||
| restore-keys: | | |||
| ${{ runner.os }}-ccache- | |||
| uses: actions/checkout@v3 | |||
| - name: Print system information | |||
| run: | | |||
| @@ -34,7 +27,7 @@ jobs: | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| sysctl -a | grep machdep.cpu | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| @@ -43,61 +36,224 @@ jobs: | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| sudo apt-get install -y gfortran cmake ccache | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
| brew reinstall gcc | |||
| brew install coreutils cmake ccache | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB | |||
| - name: gfortran build | |||
| if: matrix.build == 'make' && matrix.fortran == 'gfortran' | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| # GNU make and cmake call the compilers differently. It looks like | |||
| # that causes the cache to mismatch. Keep the ccache for both build | |||
| # tools separate to avoid polluting each other. | |||
| key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} | |||
| # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} | |||
| ccache-${{ runner.os }}-${{ matrix.build }} | |||
| - name: Configure ccache | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| if [ "${{ matrix.build }}" = "make" ]; then | |||
| # Add ccache to path | |||
| if [ "$RUNNER_OS" = "Linux" ]; then | |||
| echo "/usr/lib/ccache" >> $GITHUB_PATH | |||
| elif [ "$RUNNER_OS" = "macOS" ]; then | |||
| echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH | |||
| else | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| fi | |||
| # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 | |||
| - name: flang build | |||
| if: matrix.build == 'make' && matrix.fortran == 'flang' | |||
| - name: Build OpenBLAS | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| exit 0 | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| if [ "${{ matrix.fortran }}" = "flang" ]; then | |||
| # download and install classic flang | |||
| cd /usr/ | |||
| sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz | |||
| sudo tar xf flang-20190329-x86-70.tgz | |||
| sudo rm flang-20190329-x86-70.tgz | |||
| cd - | |||
| fi | |||
| case "${{ matrix.build }}" in | |||
| "make") | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" | |||
| ;; | |||
| "cmake") | |||
| mkdir build && cd build | |||
| cmake -DDYNAMIC_ARCH=1 \ | |||
| -DNOFORTRAN=0 \ | |||
| -DBUILD_WITHOUT_LAPACK=0 \ | |||
| -DCMAKE_VERBOSE_MAKEFILE=ON \ | |||
| -DCMAKE_BUILD_TYPE=Release \ | |||
| -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ | |||
| -DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
| -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
| .. | |||
| cmake --build . | |||
| ;; | |||
| *) | |||
| echo "::error::Configuration not supported" | |||
| exit 1 | |||
| ;; | |||
| esac | |||
| - name: Show ccache status | |||
| continue-on-error: true | |||
| run: ccache -s | |||
| - name: Run tests | |||
| timeout-minutes: 60 | |||
| run: | | |||
| case "${{ matrix.build }}" in | |||
| "make") | |||
| MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' | |||
| echo "::group::Tests in 'test' directory" | |||
| make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| echo "::group::Tests in 'ctest' directory" | |||
| make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| echo "::group::Tests in 'utest' directory" | |||
| make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| ;; | |||
| "cmake") | |||
| cd build && ctest | |||
| ;; | |||
| *) | |||
| echo "::error::Configuration not supported" | |||
| exit 1 | |||
| ;; | |||
| esac | |||
| msys2: | |||
| runs-on: windows-latest | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| msystem: [MINGW64, MINGW32, CLANG64] | |||
| idx: [int32, int64] | |||
| include: | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| - msystem: MINGW32 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-i686 | |||
| fc-pkg: mingw-w64-i686-gcc-fortran | |||
| - msystem: CLANG64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| - msystem: MINGW64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| - msystem: CLANG64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| exclude: | |||
| - msystem: MINGW32 | |||
| idx: int64 | |||
| cd /usr/ | |||
| sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz | |||
| sudo tar xf flang-20190329-x86-70.tgz | |||
| sudo rm flang-20190329-x86-70.tgz | |||
| cd - | |||
| defaults: | |||
| run: | |||
| # Use MSYS2 bash as default shell | |||
| shell: msys2 {0} | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang | |||
| env: | |||
| CHERE_INVOKING: 1 | |||
| steps: | |||
| - name: Get CPU name | |||
| shell: pwsh | |||
| run : | | |||
| Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name | |||
| - name: Install build dependencies | |||
| uses: msys2/setup-msys2@v2 | |||
| with: | |||
| msystem: ${{ matrix.msystem }} | |||
| update: true | |||
| release: false # Use pre-installed version | |||
| install: >- | |||
| base-devel | |||
| ${{ matrix.target-prefix }}-cc | |||
| ${{ matrix.fc-pkg }} | |||
| ${{ matrix.target-prefix }}-cmake | |||
| ${{ matrix.target-prefix }}-ninja | |||
| ${{ matrix.target-prefix }}-ccache | |||
| - name: CMake gfortran build | |||
| if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| # It looks like this path needs to be hard-coded. | |||
| path: C:/msys64/home/runneradmin/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} | |||
| # Restore a matching ccache cache entry. Prefer same branch. | |||
| restore-keys: | | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} | |||
| - name: Configure ccache | |||
| # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| which ccache | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 250M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| echo $HOME | |||
| cygpath -w $HOME | |||
| - name: Configure OpenBLAS | |||
| run: | | |||
| mkdir build && cd build | |||
| cmake -DBUILD_SHARED_LIBS=ON \ | |||
| -DBUILD_STATIC_LIBS=ON \ | |||
| -DDYNAMIC_ARCH=ON \ | |||
| -DUSE_THREAD=ON \ | |||
| -DNUM_THREADS=64 \ | |||
| -DTARGET=CORE2 \ | |||
| ${{ matrix.idx64-flags }} \ | |||
| ${{ matrix.c-lapack-flags }} \ | |||
| -DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
| -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
| .. | |||
| - name: Build OpenBLAS | |||
| run: cd build && cmake --build . | |||
| - name: Show ccache status | |||
| continue-on-error: true | |||
| run: ccache -s | |||
| mkdir build | |||
| cd build | |||
| cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. | |||
| make -j$(nproc) | |||
| - name: Run tests | |||
| timeout-minutes: 60 | |||
| run: cd build && ctest | |||
| @@ -25,11 +25,12 @@ matrix: | |||
| # - BTYPE="BINARY=64" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| os: linux | |||
| arch: ppc64le | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| script: | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - travis_wait 20 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -43,6 +44,7 @@ matrix: | |||
| arch: s390x | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" | |||
| - sudo apt-get install --only-upgrade binutils | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=IBMZ_LINUX | |||
| @@ -55,6 +57,7 @@ matrix: | |||
| compiler: clang | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" | |||
| - sudo apt-get install --only-upgrade binutils | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=IBMZ_LINUX | |||
| @@ -101,7 +104,7 @@ matrix: | |||
| - sudo apt-get update | |||
| - sudo apt-get install gcc-9 gfortran-9 -y | |||
| script: | |||
| - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - travis_wait 20 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -118,7 +121,7 @@ matrix: | |||
| - sudo apt-get update | |||
| - sudo apt-get install gcc-9 gfortran-9 -y | |||
| script: | |||
| - make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - travis_wait 20 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -269,9 +272,9 @@ matrix: | |||
| # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" | |||
| # - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" | |||
| - &test-graviton2 | |||
| - &test-neoversen1 | |||
| os: linux | |||
| arch: arm64-graviton2 | |||
| arch: arm64 | |||
| dist: focal | |||
| group: edge | |||
| virt: lxd | |||
| @@ -1,5 +1,9 @@ | |||
| Thank you for the support. | |||
| ### [2019.12/2021.9] [Chan-Zuckerberg Foundation EOSS Initiative](https://chanzuckerberg.com/eoss/) | |||
| Between December 2019 and September 2021, the development and maintenance of OpenBLAS were funded in part by the Chan-Zuckerberg Foundation in the context of two grants awarded to the NumPy Foundation and managed by NumFocus (Cycles 1 and 3 of the Essential Open Source Software for Science (EOSS) Initiative of the Chan-Zuckerberg Foundation). | |||
| ### [2013.8] [Testbed for OpenBLAS project](https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project) | |||
| https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project/pledges | |||
| @@ -17,14 +17,12 @@ include(GNUInstallDirs) | |||
| include(CMakePackageConfigHelpers) | |||
| if(MSVC AND NOT DEFINED NOFORTRAN) | |||
| set(NOFORTRAN ON) | |||
| endif() | |||
| ####### | |||
| if(MSVC) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| endif() | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | |||
| option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | |||
| option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| @@ -36,6 +34,8 @@ option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several | |||
| option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
| option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| @@ -179,7 +179,7 @@ endforeach () | |||
| # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. | |||
| # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. | |||
| if (NOT NOFORTRAN AND NOT NO_LAPACK) | |||
| if (NOT NO_LAPACK) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake") | |||
| if (NOT NO_LAPACKE) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake") | |||
| @@ -205,8 +205,8 @@ endif () | |||
| # add objects to the openblas lib | |||
| if(NOT NO_LAPACK) | |||
| add_library(LAPACK OBJECT ${LA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>") | |||
| add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) | |||
| @@ -247,7 +247,7 @@ endif() | |||
| if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| @@ -314,14 +314,16 @@ endif() | |||
| if (NOT NOFORTRAN) | |||
| # Build test and ctest | |||
| add_subdirectory(test) | |||
| if (BUILD_TESTING) | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| endif() | |||
| endif() | |||
| if(NOT NO_CBLAS) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} | |||
| @@ -394,14 +396,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT DEFINED USE_PERL) | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| else() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| endif() | |||
| endif() | |||
| # Install project | |||
| # Install libraries | |||
| @@ -207,3 +207,8 @@ In chronological order: | |||
| * Ilya Kurdyukov <https://github.com/ilyakurdyukov> | |||
| * [2021-02-21] Add basic support for the Elbrus E2000 architecture | |||
| * PLCT Lab, Institute of Software Chinese Academy of Sciences | |||
| * [2022-03] Support RISC-V Vector Intrinsic 1.0 version. | |||
| @@ -1,4 +1,86 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.21 | |||
| 07-Aug-2022 | |||
| general: | |||
| - Updated the included LAPACK to Reference-LAPACK release 3.10.1 | |||
| - when no Fortran compiler is available, OpenBLAS builds will now automatically | |||
| build LAPACK from an f2c-converted copy of LAPACK 3.9.0 unless the NO_LAPACK option | |||
| is specified | |||
| - similarly added C versions of the BLAS and CBLAS tests | |||
| - enabled building of the ReLAPACK GEMMT kernels when ReLAPACK is built | |||
| - function LAPACKE_lsame is now annotated with the GCC attribute "const" to aid static analyzers | |||
| - added USE_TLS to the list of options reported by the openblas_get_config() function | |||
| - CMAKE builds now support the BUILD_TESTING keyword (to disable the LAPACK testsuite) of Reference-LAPACK | |||
| - fixed CMAKE builds of the laswp_ncopy and neg_tcopy kernels | |||
| - removed the build system requirements for PERL (while keeping the original perl scripts as backup) | |||
| - handle building and running OpenBLAS on systems that report zero available cpu cores | |||
| - added SYMBOLPREFIX/SYMBOLSUFFIX handling for LAPACK 3.10.0 functions added in 0.3.20 | |||
| - fixed linking of the utests on QNX | |||
| - Added support for compilation with the Intel ifx compiler | |||
| - Added support for compilation with the Fujitsu FCC compiler for Fugaku | |||
| - Added support for compilation with the Cray C and Fortran compilers | |||
| - reverted OpenMP threadpool behaviour in the exec_blas call to its state before 0.3.11, that is | |||
| the threadpool will no longer grow or shrink on demand as the overhead for this is too big at least with | |||
| GNU OpenMP. The adaptive behaviour introduced in 0.3.11 can still be requested at runtime by setting | |||
| the environment variable OMP_ADAPTIVE | |||
| - worked around spurious STFSM/CTFSM errors reported by the LAPACK testsuite | |||
| x86_64: | |||
| - fixed determination of compiler support for AVX512 and removed the 0.3.19 | |||
| workaround for building SKYLAKEX kernels on Sandybridge hardware | |||
| - fixed compilation for the SKYLAKEX target with gcc 6 | |||
| - fixed compilation of the CooperLake SBGEMM kernel with LLVM | |||
| - fixed compilation of the SkyLakeX small matrix GEMM kernels with LLVM or ICC | |||
| - fixed compilation of some BFLOAT16 kernels with CMAKE | |||
| - added support for the Zhaoxin/Centaur KH40000 cpu | |||
| - fixed a potential crash in the ZSYMV kernel used for all targets except generic | |||
| - fixed gmake compilation for DYNAMIC_ARCH with a DYNAMIC_LIST including ATOM | |||
| - fixed compilation of LAPACKE with the INTEGER64 option on Windows | |||
| - added support for cross-compiling to individual Intel or AMD targets using CMAKE | |||
| (previously only CORE2 supported, added targets are ATOM, PRESCOTT, NEHALEM, SANDYBRIDGE, | |||
| HASWELL,SKYLAKEX, COOPERLAKE, SAPPHIRERAPIDS, OPTERON, BARCELONA, BULLDOZER, PILEDRIVER, | |||
| STEAMROLLER,EXCAVATOR, ZEN) | |||
| SPARC: | |||
| - worked around an overflow error in the DNRM2 kernel | |||
| POWER: | |||
| - worked around an overflow error in the POWER6 DNRM2 kernel | |||
| - fixed compilation on PPC440 | |||
| - fixed a performance regression in the level1 BLAS on POWER10 | |||
| - fixed the POWER10 ZGEMM kernel | |||
| - fixed singlethreaded builds for POWER10 | |||
| - fixed compilation of the POWER10 DGEMV kernel with older gcc versions | |||
| - enabled compilation of the BFLOAT16 kernels by default | |||
| - enabled the small matrix kernels by default for DYNAMIC_ARCH builds | |||
| - added a workaround for a miscompilation of the CDOT and ZDOT kernels by GCC 12 | |||
| RISCV: | |||
| - fixed cpu autodetection logic | |||
| ARMV8: | |||
| - added an SBGEMM kernel for Neoverse N2 | |||
| - worked around an overflow error in the DNRM2 kernel used on M1, NeoverseN1, ThunderX2T99 | |||
| - added support for ARM64 systems running MS Windows | |||
| - added support for cross-compiling to the GENERIC ARMV8 target under CMAKE (Windows/MSVC) | |||
| - fixed a performance regression in the generic ARMV8 DGEMM kernel introduced in 0.3.19 | |||
| - added initial support for the Apple M1 cpu under Linux | |||
| - added initial support for the Phytium FT2000 cpu | |||
| - added initial support for the Cortex A510, A710, X1 and X2 cpus | |||
| - fixed an accidental mixup of cpu identifiers in the autodetection code introduced in 0.3.20 | |||
| - fixed linking of Apple M1 builds on macOS 12 and later with recent XCode | |||
| - made Neoverse N2 available in DYNAMIC_ARCH builds | |||
| MIPS,MIPS64: | |||
| - worked around an overflow error in the DNRM2 kernel | |||
| LOONGARCH64: | |||
| - worked around an overflow error in the DNRM2 kernel | |||
| - added preliminary support for the LOONGSON2K1000 cpu | |||
| - added DYNAMIC_ARCH support | |||
| ==================================================================== | |||
| Version 0.3.20 | |||
| 20-Feb-2022 | |||
| @@ -25,11 +25,14 @@ ifeq ($(NO_FORTRAN), 1) | |||
| define NOFORTRAN | |||
| 1 | |||
| endef | |||
| define NO_LAPACK | |||
| ifneq ($(NO_LAPACK), 1) | |||
| define C_LAPACK | |||
| 1 | |||
| endef | |||
| endif | |||
| export NOFORTRAN | |||
| export NO_LAPACK | |||
| export C_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
| @@ -146,21 +149,25 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| endif | |||
| endif | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| $(MAKE) -C utest all | |||
| endif | |||
| ifneq ($(NO_CBLAS), 1) | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| $(MAKE) -C ctest all | |||
| endif | |||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||
| $(MAKE) -C cpp_thread_test all | |||
| endif | |||
| endif | |||
| endif | |||
| libs : | |||
| ifeq ($(CORE), UNKNOWN) | |||
| $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) | |||
| endif | |||
| ifeq ($(NOFORTRAN), 1) | |||
| $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) | |||
| $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.) | |||
| endif | |||
| ifeq ($(NO_STATIC), 1) | |||
| ifeq ($(NO_SHARED), 1) | |||
| @@ -241,19 +248,14 @@ hpl_p : | |||
| fi; \ | |||
| done | |||
| ifeq ($(NO_LAPACK), 1) | |||
| netlib : | |||
| else | |||
| netlib : lapack_prebuild | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| ifneq ($(NO_LAPACK), 1) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| ifneq ($(NO_LAPACKE), 1) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||
| endif | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| re_lapack : | |||
| @@ -267,7 +269,7 @@ prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| lapack_prebuild : | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -3,6 +3,9 @@ ifneq ($(C_COMPILER), PGI) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| ISCLANG=1 | |||
| endif | |||
| ifeq ($(C_COMPILER), FUJITSU) | |||
| ISCLANG=1 | |||
| endif | |||
| ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -55,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), FT2000) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| # Use a72 tunings because Neoverse-N1 is only available | |||
| # in GCC>=9 | |||
| ifeq ($(CORE), NEOVERSEN1) | |||
| @@ -114,9 +124,9 @@ ifeq ($(CORE), NEOVERSEN2) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| @@ -229,6 +239,43 @@ endif | |||
| endif | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), CORTEXX1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), CORTEXX2) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a+sve | |||
| endif | |||
| endif | |||
| endif | |||
| #ifeq (1, $(filter 1,$(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), CORTEXA510) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a+sve | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), CORTEXA710) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a+sve | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -15,6 +15,12 @@ TARGET_MAKE = Makefile.conf | |||
| TARGET_CONF = config.h | |||
| endif | |||
| ifdef USE_PERL | |||
| SCRIPTSUFFIX = .pl | |||
| else | |||
| SCRIPTSUFFIX = | |||
| endif | |||
| # CPUIDEMU = ../../cpuid/table.o | |||
| ifdef CPUIDEMU | |||
| @@ -46,17 +52,17 @@ TARGET_FLAGS = -mips64r6 | |||
| endif | |||
| ifeq ($(TARGET), C910V) | |||
| TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v | |||
| TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| config.h : c_check f_check getarch | |||
| perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS) | |||
| $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch | |||
| ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS) | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) | |||
| ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) | |||
| else | |||
| #When we only build CBLAS, we set NOFORTRAN=2 | |||
| echo "NOFORTRAN=2" >> $(TARGET_MAKE) | |||
| @@ -71,9 +77,11 @@ endif | |||
| getarch : getarch.c cpuid.S dummy $(CPUIDEMU) | |||
| $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) | |||
| avx512=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ | |||
| rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ | |||
| $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) | |||
| getarch_2nd : getarch_2nd.c config.h dummy | |||
| getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy | |||
| ifndef TARGET_CORE | |||
| $(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c | |||
| else | |||
| @@ -81,3 +89,5 @@ else | |||
| endif | |||
| dummy: | |||
| .PHONY: dummy | |||
| @@ -1,4 +1,4 @@ | |||
| ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v | |||
| FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
| @@ -261,8 +261,9 @@ endif | |||
| #For small matrix optimization | |||
| ifeq ($(ARCH), x86_64) | |||
| SMALL_MATRIX_OPT = 1 | |||
| else ifeq ($(CORE), POWER10) | |||
| else ifeq ($(ARCH), power) | |||
| SMALL_MATRIX_OPT = 1 | |||
| BUILD_BFLOAT16 = 1 | |||
| endif | |||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
| @@ -352,7 +353,7 @@ OBJCONV = $(CROSS_SUFFIX)objconv | |||
| # When fortran support was either not detected or actively deselected, only build BLAS. | |||
| ifeq ($(NOFORTRAN), 1) | |||
| NO_LAPACK = 1 | |||
| C_LAPACK = 1 | |||
| override FEXTRALIB = | |||
| endif | |||
| @@ -384,8 +385,12 @@ endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| ifndef MACOSX_DEPLOYMENT_TARGET | |||
| ifeq ($(ARCH), arm64) | |||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | |||
| else | |||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
| endif | |||
| endif | |||
| MD5SUM = md5 -r | |||
| endif | |||
| @@ -675,6 +680,10 @@ ifeq ($(ARCH), mips64) | |||
| DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| @@ -847,7 +856,7 @@ CCOMMON_OPT += -mabi=32 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| CCOMMON_OPT += -march=loongson3a | |||
| FCOMMON_OPT += -march=loongson3a | |||
| endif | |||
| @@ -887,11 +896,9 @@ BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| ifeq ($(CORE), LOONGSON3R5) | |||
| CCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
| FCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -1041,9 +1048,13 @@ FCOMMON_OPT += -frecursive | |||
| # work around ABI problem with passing single-character arguments | |||
| FCOMMON_OPT += -fno-optimize-sibling-calls | |||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| ifneq ($(NOFORTRAN), 1) | |||
| ifneq ($(NOFORTRAN), 2) | |||
| ifneq ($(NO_LAPACK), 1) | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_BINARY_MODE | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
| ifdef BINARY64 | |||
| @@ -1179,7 +1190,6 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -n32 | |||
| @@ -1189,11 +1199,9 @@ endif | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| else | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -m32 | |||
| @@ -1201,7 +1209,6 @@ else | |||
| FCOMMON_OPT += -m64 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FEXTRALIB += -lstdc++ | |||
| FCOMMON_OPT += -mp | |||
| @@ -1209,7 +1216,6 @@ endif | |||
| endif | |||
| ifeq ($(C_COMPILER), OPEN64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||
| ifndef BINARY64 | |||
| CCOMMON_OPT += -n32 | |||
| @@ -1219,13 +1225,10 @@ endif | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| else | |||
| ifndef BINARY64 | |||
| CCOMMON_OPT += -m32 | |||
| else | |||
| @@ -1271,6 +1274,19 @@ FCOMMON_OPT += -openmp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), CRAY) | |||
| CCOMMON_OPT += -DF_INTERFACE_INTEL | |||
| FCOMMON_OPT += -hnopattern | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -s integer64 | |||
| endif | |||
| endif | |||
| ifneq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -O noomp | |||
| endif | |||
| endif | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| @@ -1303,6 +1319,10 @@ ifeq ($(DYNAMIC_OLDER), 1) | |||
| CCOMMON_OPT += -DDYNAMIC_OLDER | |||
| endif | |||
| ifeq ($(C_LAPACK), 1) | |||
| CCOMMON_OPT += -DC_LAPACK | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| CCOMMON_OPT += -DNO_LAPACK | |||
| #Disable LAPACK C interface | |||
| @@ -1532,7 +1552,7 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| ifdef NEED_PIC | |||
| ifeq ($(NEED_PIC), 1) | |||
| ifeq (,$(findstring PIC,$(FFLAGS))) | |||
| override FFLAGS += -fPIC | |||
| endif | |||
| @@ -1550,6 +1570,11 @@ endif | |||
| ifeq ($(F_COMPILER),NAG) | |||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| ifeq ($(F_COMPILER),CRAY) | |||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| @@ -1562,6 +1587,7 @@ endif | |||
| ifdef OS_WINDOWS | |||
| LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS | |||
| LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE | |||
| endif | |||
| ifeq ($(C_COMPILER), LSB) | |||
| LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE | |||
| @@ -1661,6 +1687,7 @@ export USE_OPENMP | |||
| export CROSS | |||
| export CROSS_SUFFIX | |||
| export NOFORTRAN | |||
| export C_LAPACK | |||
| export NO_FBLAS | |||
| export EXTRALIB | |||
| export CEXTRALIB | |||
| @@ -92,6 +92,10 @@ CORTEXA53 | |||
| CORTEXA57 | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| CORTEXA510 | |||
| CORTEXA710 | |||
| CORTEXX1 | |||
| CORTEXX2 | |||
| NEOVERSEN1 | |||
| NEOVERSEV1 | |||
| NEOVERSEN2 | |||
| @@ -103,6 +107,9 @@ THUNDERX2T99 | |||
| TSV110 | |||
| THUNDERX3T110 | |||
| VORTEX | |||
| A64FX | |||
| ARMV8SVE | |||
| FT2000 | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| @@ -114,7 +121,9 @@ RISCV64_GENERIC | |||
| C910V | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| LOONGSON3R5 | |||
| LOONGSON2K1000 | |||
| 12. Elbrus E2000: | |||
| E2K | |||
| @@ -65,7 +65,7 @@ jobs: | |||
| - task: CMake@1 | |||
| inputs: | |||
| workingDirectory: 'build' # Optional | |||
| cmakeArgs: '-G "Visual Studio 16 2019" ..' | |||
| cmakeArgs: '-G "Visual Studio 17 2022" ..' | |||
| - task: CMake@1 | |||
| inputs: | |||
| cmakeArgs: '--build . --config Release' | |||
| @@ -81,7 +81,7 @@ jobs: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" | |||
| mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="SANDYBRIDGE" | |||
| - job: Windows_clang_cmake | |||
| pool: | |||
| @@ -103,7 +103,7 @@ jobs: | |||
| - job: Windows_flang_clang | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| vmImage: 'windows-2022' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| @@ -114,11 +114,31 @@ jobs: | |||
| conda install --yes --quiet ninja flang | |||
| mkdir build | |||
| cd build | |||
| call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: Windows_cl_flang | |||
| pool: | |||
| vmImage: 'windows-2022' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
| set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
| conda config --add channels conda-forge --force | |||
| conda config --set auto_update_conda false | |||
| conda install --yes --quiet ninja flang | |||
| mkdir build | |||
| cd build | |||
| call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: OSX_OpenMP | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| @@ -143,11 +163,12 @@ jobs: | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOSX_DEPLOYMENT_TARGET: 11.0 | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm libomp | |||
| make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 | |||
| make TARGET=CORE2 USE_OPENMP=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang NOFORTRAN=1 | |||
| - job: OSX_OpenMP_Clang_cmake | |||
| pool: | |||
| @@ -178,7 +199,7 @@ jobs: | |||
| cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
| cmake --build . | |||
| ctest | |||
| - job: OSX_Ifort_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| @@ -1,426 +1,415 @@ | |||
| #!/usr/bin/env perl | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| #!/bin/sh | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`; | |||
| $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | |||
| chop($hostarch); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); | |||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
| $hostarch = "zarch" if ($hostarch eq "s390x"); | |||
| #$tmpf = new File::Temp( UNLINK => 1 ); | |||
| $binary = $ENV{"BINARY"}; | |||
| $makefile = shift(@ARGV); | |||
| $config = shift(@ARGV); | |||
| $compiler_name = shift(@ARGV); | |||
| $flags = join(" ", @ARGV); | |||
| hostos=`uname -s | sed -e 's/\-.*//'` | |||
| hostarch=`uname -m | sed -e 's/i.86/x86/'` | |||
| if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then | |||
| hostarch=`uname -p` | |||
| fi | |||
| case "$hostarch" in | |||
| amd64) hostarch=x86_64 ;; | |||
| arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;; | |||
| aarch64) hostarch=arm64 ;; | |||
| powerpc*|ppc*) hostarch=power ;; | |||
| s390x) hostarch=zarch ;; | |||
| esac | |||
| makefile="$1" | |||
| config="$2" | |||
| compiler_name="$3" | |||
| shift 3 | |||
| flags="$*" | |||
| # First, we need to know the target OS and compiler name | |||
| $data = `$compiler_name $flags -E ctest.c`; | |||
| if ($?) { | |||
| printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; | |||
| die 1; | |||
| } | |||
| $cross_suffix = ""; | |||
| eval "use File::Basename"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Basename, emulating its functionality"; | |||
| my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); | |||
| if ($dirnam ne ".") { | |||
| $cross_suffix .= $dirnam . "/"; | |||
| } | |||
| my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); | |||
| if ($basnam =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } else { | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } | |||
| $compiler = ""; | |||
| $compiler = LSB if ($data =~ /COMPILER_LSB/); | |||
| $compiler = CLANG if ($data =~ /COMPILER_CLANG/); | |||
| $compiler = PGI if ($data =~ /COMPILER_PGI/); | |||
| $compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); | |||
| $compiler = INTEL if ($data =~ /COMPILER_INTEL/); | |||
| $compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/); | |||
| $compiler = SUN if ($data =~ /COMPILER_SUN/); | |||
| $compiler = IBM if ($data =~ /COMPILER_IBM/); | |||
| $compiler = DEC if ($data =~ /COMPILER_DEC/); | |||
| $compiler = GCC if ($compiler eq ""); | |||
| $os = Linux if ($data =~ /OS_LINUX/); | |||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||
| $os = OpenBSD if ($data =~ /OS_OPENBSD/); | |||
| $os = DragonFly if ($data =~ /OS_DRAGONFLY/); | |||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||
| $os = AIX if ($data =~ /OS_AIX/); | |||
| $os = osf if ($data =~ /OS_OSF/); | |||
| $os = WINNT if ($data =~ /OS_WINNT/); | |||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||
| $os = Interix if ($data =~ /OS_INTERIX/); | |||
| $os = Android if ($data =~ /OS_ANDROID/); | |||
| $os = Haiku if ($data =~ /OS_HAIKU/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $defined = 0; | |||
| if ($os eq "AIX") { | |||
| $compiler_name .= " -maix32" if ($binary eq "32"); | |||
| $compiler_name .= " -maix64" if ($binary eq "64"); | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "mips") { | |||
| $compiler_name .= " -mabi=32"; | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "mips64") { | |||
| $compiler_name .= " -mabi=n32" if ($binary eq "32"); | |||
| $compiler_name .= " -mabi=64" if ($binary eq "64"); | |||
| $defined = 1; | |||
| } | |||
| if (($architecture eq "arm") || ($architecture eq "arm64")) { | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "zarch") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "e2k") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "alpha") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "ia64") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { | |||
| $defined = 1; | |||
| $binary =32; | |||
| } | |||
| if ($architecture eq "riscv64") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "loongarch64") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($compiler eq "PGI") { | |||
| $compiler_name .= " -tp p7" if ($binary eq "32"); | |||
| $compiler_name .= " -tp p7-64" if ($binary eq "64"); | |||
| $openmp = "-mp"; | |||
| $defined = 1; | |||
| } | |||
| if ($compiler eq "IBM") { | |||
| $compiler_name .= " -q32" if ($binary eq "32"); | |||
| $compiler_name .= " -q64" if ($binary eq "64"); | |||
| $openmp = "-qsmp=omp"; | |||
| $defined = 1; | |||
| } | |||
| if ($compiler eq "INTEL") { | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($compiler eq "PATHSCALE") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler eq "OPEN64") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler eq "CLANG") { | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler eq "GCC" || $compiler eq "LSB") { | |||
| $openmp = "-fopenmp"; | |||
| { | |||
| data=`$compiler_name $flags -E ctest.c` | |||
| } || { | |||
| printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2 | |||
| exit 1 | |||
| } | |||
| if ($defined == 0) { | |||
| $compiler_name .= " -m32" if ($binary eq "32"); | |||
| $compiler_name .= " -m64" if ($binary eq "64"); | |||
| } | |||
| cross_suffix="" | |||
| if [ "`dirname $compiler_name`" != '.' ]; then | |||
| cross_suffix="$cross_suffix`dirname $compiler_name`/" | |||
| fi | |||
| bn=`basename $compiler_name` | |||
| case "$bn" in | |||
| *-*) cross_suffix="$cross_suffix${bn%-*}-" | |||
| esac | |||
| compiler="" | |||
| case "$data" in | |||
| *COMPILER_LSB*) compiler=LSB ;; | |||
| *COMPILER_CLANG*) compiler=CLANG ;; | |||
| *COMPILER_PGI*) compiler=PGI ;; | |||
| *COMPILER_PATHSCALE*) compiler=PATHSCALE ;; | |||
| *COMPILER_INTEL*) compiler=INTEL ;; | |||
| *COMPILER_OPEN64*) compiler=OPEN64 ;; | |||
| *COMPILER_SUN*) compiler=SUN ;; | |||
| *COMPILER_IBM*) compiler=IBM ;; | |||
| *COMPILER_DEC*) compiler=DEC ;; | |||
| *COMPILER_FUJITSU*) compiler=FUJITSU ;; | |||
| esac | |||
| if [ -z "$compiler" ]; then | |||
| compiler=GCC | |||
| fi | |||
| case "$data" in *OS_LINUX*) os=Linux ;; esac | |||
| case "$data" in *OS_FREEBSD*) os=FreeBSD ;; esac | |||
| case "$data" in *OS_NETBSD*) os=NetBSD ;; esac | |||
| case "$data" in *OS_OPENBSD*) os=OpenBSD ;; esac | |||
| case "$data" in *OS_DRAGONFLY*) os=DragonFly ;; esac | |||
| case "$data" in *OS_DARWIN*) os=Darwin ;; esac | |||
| case "$data" in *OS_SUNOS*) os=SunOS ;; esac | |||
| case "$data" in *OS_AIX*) os=AIX ;; esac | |||
| case "$data" in *OS_OSF*) os=osf ;; esac | |||
| case "$data" in *OS_WINNT*) os=WINNT ;; esac | |||
| case "$data" in *OS_CYGWIN_NT*) os=CYGWIN_NT ;; esac | |||
| case "$data" in *OS_INTERIX*) os=Interix ;; esac | |||
| case "$data" in *OS_ANDROID*) os=Android ;; esac | |||
| case "$data" in *OS_HAIKU*) os=Haiku ;; esac | |||
| case "$data" in | |||
| *ARCH_X86_64*) architecture=x86_64 ;; | |||
| *ARCH_X86*) architecture=x86 ;; | |||
| *ARCH_E2K*) architecture=e2k ;; | |||
| *ARCH_POWER*) architecture=power ;; | |||
| *ARCH_MIPS64*) architecture=mips64 ;; | |||
| *ARCH_MIPS*) architecture=mips ;; | |||
| *ARCH_ALPHA*) architecture=alpha ;; | |||
| *ARCH_SPARC*) architecture=sparc ;; | |||
| *ARCH_IA64*) architecture=ia64 ;; | |||
| *ARCH_ARM64*) architecture=arm64 ;; | |||
| *ARCH_ARM*) architecture=arm ;; | |||
| *ARCH_ZARCH*) architecture=zarch ;; | |||
| *ARCH_RISCV64*) architecture=riscv64 ;; | |||
| *ARCH_LOONGARCH64*) architecture=loongarch64 ;; | |||
| esac | |||
| defined=0 | |||
| if [ "$os" = "AIX" ]; then | |||
| case "$BINARY" in | |||
| 32) compiler_name="$compiler_name -maix32" ;; | |||
| 64) compiler_name="$compiler_name -maix64" ;; | |||
| esac | |||
| defined=1 | |||
| fi | |||
| case "$architecture" in | |||
| mips) | |||
| compiler_name="$compiler_name -mabi=32" | |||
| defined=1 | |||
| ;; | |||
| mips64) | |||
| case "$BINARY" in | |||
| 32) compiler_name="$compiler_name -mabi=n32" ;; | |||
| 64) compiler_name="$compiler_name -mabi=64" ;; | |||
| esac | |||
| defined=1 | |||
| ;; | |||
| arm|arm64) defined=1 ;; | |||
| zarch|e2k|alpha|ia64|riscv64|loonarch64) | |||
| defined=1 | |||
| BINARY=64 | |||
| ;; | |||
| x86) | |||
| [ "$os" != "Darwin" ] && [ "$os" != "SunOS" ] && { | |||
| defined=1 | |||
| BINARY=32 | |||
| } | |||
| ;; | |||
| esac | |||
| case "$compiler" in | |||
| PGI) | |||
| case "$BINARY" in | |||
| 32) compiler_name="$compiler_name -tp p7" ;; | |||
| 64) compiler_name="$compiler_name -tp p7-64" ;; | |||
| esac | |||
| openmp='-mp' | |||
| defined=1 | |||
| ;; | |||
| IBM) | |||
| case "$BINARY" in | |||
| 32) compiler_name="$compiler_name -q32" ;; | |||
| 64) compiler_name="$compiler_name -q64" ;; | |||
| esac | |||
| openmp='-qsmp=omp' | |||
| defined=1 | |||
| ;; | |||
| INTEL) openmp='-openmp' ;; | |||
| PATHSCALE|OPEN64) openmp='-mp' ;; | |||
| CLANG|GCC|LSB) openmp='-fopenmp' ;; | |||
| FUJITSU) openmp='-Kopenmp' ;; | |||
| esac | |||
| if [ "$defined" -eq 0 ]; then | |||
| case "$BINARY" in | |||
| 32) compiler_name="$compiler_name -m32" ;; | |||
| 64) compiler_name="$compiler_name -m64" ;; | |||
| esac | |||
| fi | |||
| # Do again | |||
| $data = `$compiler_name $flags -E ctest.c`; | |||
| if ($?) { | |||
| printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; | |||
| die 1; | |||
| { | |||
| data="$($compiler_name $flags -E ctest.c)" | |||
| } || { | |||
| printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2 | |||
| exit 1 | |||
| } | |||
| $have_msa = 0; | |||
| if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||
| } else { | |||
| $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o $tmpf"; | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $have_msa = 0; | |||
| } else { | |||
| $have_msa = 1; | |||
| } | |||
| unlink("$tmpf.o"); | |||
| have_msa=0 | |||
| if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then | |||
| tmpd="$(mktemp -d)" | |||
| tmpf="$tmpd/a.c" | |||
| code='"addvi.b $w0, $w1, 1"' | |||
| msa_flags='-mmsa -mfp64 -mload-store-pairs' | |||
| printf "#include <msa.h>\n\n" >> "$tmpf" | |||
| printf "void main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" | |||
| args="$msa_flags -o $tmpf.o $tmpf" | |||
| have_msa=1 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| have_msa=0 | |||
| } | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $binformat = bin32; | |||
| $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| $no_avx512= 0; | |||
| if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||
| $no_avx512 = 0; | |||
| } else { | |||
| # $tmpf = new File::Temp( UNLINK => 1 ); | |||
| ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf"; | |||
| if ($compiler eq "PGI") { | |||
| $args = " -tp skylake -c -o $tmpf.o $tmpf"; | |||
| } | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| $no_avx512 = 0; | |||
| } | |||
| unlink("$tmpf.o"); | |||
| rm -rf "$tmpd" | |||
| fi | |||
| case "$data" in | |||
| *ARCH_X86_64*) architecture=x86_64 ;; | |||
| *ARCH_X86*) architecture=x86 ;; | |||
| *ARCH_E2K*) architecture=e2k ;; | |||
| *ARCH_POWER*) architecture=power ;; | |||
| *ARCH_MIPS64*) architecture=mips64 ;; | |||
| *ARCH_MIPS*) architecture=mips ;; | |||
| *ARCH_ALPHA*) architecture=alpha ;; | |||
| *ARCH_SPARC*) architecture=sparc ;; | |||
| *ARCH_IA64*) architecture=ia64 ;; | |||
| *ARCH_ARM64*) architecture=arm64 ;; | |||
| *ARCH_ARM*) architecture=arm ;; | |||
| *ARCH_ZARCH*) architecture=zarch ;; | |||
| *ARCH_LOONGARCH64*) architecture=loongarch64 ;; | |||
| esac | |||
| binformat='bin32' | |||
| case "$data" in | |||
| *BINARY_64*) binformat='bin64' ;; | |||
| esac | |||
| no_avx512=0 | |||
| if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then | |||
| tmpd=`mktemp -d` | |||
| tmpf="$tmpd/a.c" | |||
| code='"vbroadcastss -4 * 4(%rsi), %zmm2"' | |||
| printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" | |||
| if [ "$compiler" = "PGI" ]; then | |||
| args=" -tp skylake -c -o $tmpf.o $tmpf" | |||
| else | |||
| args=" -march=skylake-avx512 -c -o $tmpf.o $tmpf" | |||
| fi | |||
| no_avx512=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_avx512=1 | |||
| } | |||
| } | |||
| $c11_atomics = 0; | |||
| if ($data =~ /HAVE_C11/) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; | |||
| $c11_atomics = 0; | |||
| } else { | |||
| ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
| print $tmpf "#include <stdatomic.h>\nint main(void){}\n"; | |||
| $args = " -c -o $tmpf.o $tmpf"; | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $c11_atomics = 0; | |||
| } else { | |||
| $c11_atomics = 1; | |||
| } | |||
| unlink("$tmpf.o"); | |||
| rm -rf "$tmpd" | |||
| fi | |||
| no_rv64gv=0 | |||
| if [ "$architecture" = "riscv64" ]; then | |||
| tmpd=`mktemp -d` | |||
| tmpf="$tmpd/a.c" | |||
| code='"vsetvli zero, zero, e8, m1\n"' | |||
| printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" | |||
| args=" -march=rv64gv -c -o $tmpf.o $tmpf" | |||
| no_rv64gv=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_rv64gv=1 | |||
| } | |||
| rm -rf "$tmpd" | |||
| fi | |||
| c11_atomics=0 | |||
| case "$data" in | |||
| *HAVE_C11*) | |||
| tmpd=`mktemp -d` | |||
| tmpf="$tmpd/a.c" | |||
| printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf" | |||
| args=" -c -o $tmpf.o $tmpf" | |||
| c11_atomics=1 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| c11_atomics=0 | |||
| } | |||
| rm -rf "$tmpd" | |||
| ;; | |||
| esac | |||
| oldgcc=0 | |||
| no_avx2=0 | |||
| if [ "$compiler" = "GCC" ]; then | |||
| case "$architecture" in x86|x86_64) | |||
| no_avx2=0 | |||
| oldgcc=0 | |||
| data=`$compiler_name -dumpversion` | |||
| case "$data" in *.*.*) | |||
| data="${data%.*}" | |||
| esac | |||
| if awk -v n1=$data -v n2=4.6 'BEGIN { exit !(n1 <= n2) }'; then | |||
| no_avx2=1 | |||
| oldgcc=1 | |||
| fi | |||
| esac | |||
| fi | |||
| data=`$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s` | |||
| need_fu='' | |||
| if echo "$data" | grep 'globl[[:space:]][_\.]'; then | |||
| need_fu="${data##*globl[[:space:]]}" | |||
| need_fu="${need_fu%%[!_\.]*}" | |||
| fi | |||
| cross=0 | |||
| if [ "$architecture" != "$hostarch" ]; then | |||
| cross=1 | |||
| [ "$hostarch" = "x86_64" ] && [ "$architecture" = "x86" ] && cross=0 | |||
| [ "$hostarch" = "mips64" ] && [ "$architecture" = "mips" ] && cross=0 | |||
| fi | |||
| [ "$os" != "$hostos" ] && cross=1 | |||
| [ "$os" = "Android" ] && [ "$hostos" = "Linux" ] && [ -n "$TERMUX_APP_PID" ] \ | |||
| && cross=0 | |||
| [ "$USE_OPENMP" != 1 ] && openmp='' | |||
| linker_L="" | |||
| linker_l="" | |||
| linker_a="" | |||
| link=`$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe` | |||
| link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'` | |||
| flags=`echo $link | tr "'[[:space:]],\n" " "` | |||
| # Strip trailing quotes | |||
| old_flags="$flags" | |||
| flags='' | |||
| for flag in $old_flags; do | |||
| f=`echo "$flag" | tr '"' ' '` | |||
| flags="$flags $f" | |||
| done | |||
| for flag in $flags; do | |||
| case "$flag" in -L*) | |||
| case "$flag" in | |||
| -LIST:*|-LANG:*) ;; | |||
| *) linker_L="$linker_L $flag" ;; | |||
| esac | |||
| esac | |||
| case "$flag" in -Y*) | |||
| linker_L="$linker_L -Wl,$flag" ;; | |||
| esac | |||
| case "$flag" in --exclude-libs*) | |||
| linker_L="$linker_L -Wl,$flag" | |||
| flags="" | |||
| ;; | |||
| esac | |||
| case "$flag" in -l*) | |||
| case "$flag" in | |||
| *gfortranbegin*|*frtbegin*|*pathfstart*|*numa*|*crt[0-9]*|\ | |||
| *gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|*omp*|\ | |||
| *[0-9]*) ;; | |||
| *) linker_l="$linker_l $flag" ;; | |||
| esac | |||
| esac | |||
| case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac | |||
| done | |||
| [ "$makefile" = "-" ] && { | |||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| exit 0 | |||
| } | |||
| if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { | |||
| $no_avx2 = 0; | |||
| $oldgcc = 0; | |||
| $data = `$compiler_name -dumpversion`; | |||
| if ($data <= 4.6) { | |||
| $no_avx2 = 1; | |||
| $oldgcc = 1; | |||
| } | |||
| } | |||
| $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| :> "$makefile" || exit 1 | |||
| :> "$config" || exit 1 | |||
| $data =~ /globl\s([_\.]*)(.*)/; | |||
| $need_fu = $1; | |||
| $cross = 0; | |||
| if ($architecture ne $hostarch) { | |||
| $cross = 1; | |||
| $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86")); | |||
| $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | |||
| } | |||
| $cross = 1 if ($os ne $hostos); | |||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | |||
| $linker_L = ""; | |||
| $linker_l = ""; | |||
| $linker_a = ""; | |||
| # print $data, "\n"; | |||
| { | |||
| $link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; | |||
| $link =~ s/\-Y\sP\,/\-Y/g; | |||
| @flags = split(/[\s\,\n]/, $link); | |||
| # remove leading and trailing quotes from each flag. | |||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||
| foreach $flags (@flags) { | |||
| if ( | |||
| ($flags =~ /^\-L/) | |||
| && ($flags !~ /^-LIST:/) | |||
| && ($flags !~ /^-LANG:/) | |||
| ) { | |||
| $linker_L .= $flags . " " | |||
| } | |||
| if ($flags =~ /^\-Y/) { | |||
| $linker_L .= "-Wl,". $flags . " " | |||
| } | |||
| if ($flags =~ /^\--exclude-libs/) { | |||
| $linker_L .= "-Wl,". $flags . " "; | |||
| $flags=""; | |||
| } | |||
| if ( | |||
| ($flags =~ /^\-l/) | |||
| && ($flags !~ /gfortranbegin/) | |||
| && ($flags !~ /frtbegin/) | |||
| && ($flags !~ /pathfstart/) | |||
| && ($flags !~ /numa/) | |||
| && ($flags !~ /crt[0-9]/) | |||
| && ($flags !~ /gcc/) | |||
| && ($flags !~ /user32/) | |||
| && ($flags !~ /kernel32/) | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/) | |||
| && ($flags !~ /[0-9]+/) | |||
| ) { | |||
| $linker_l .= $flags . " " | |||
| } | |||
| $linker_a .= $flags . " " if $flags =~ /\.a$/; | |||
| printf "OSNAME=%s\n" "$os" | |||
| printf "ARCH=%s\n" "$architecture" | |||
| printf "C_COMPILER=%s\n" "$compiler" | |||
| [ $binformat != 'bin32' ] && printf "BINARY32=\n" | |||
| [ $binformat != 'bin64' ] && printf "BINARY64=\n" | |||
| [ "$binformat" = "bin32" ] && printf "BINARY32=1\n" | |||
| [ "$binformat" = "bin64" ] && printf "BINARY64=1\n" | |||
| [ -n "$need_fu" ] && printf 'FU=%s\n' "$need_fu" | |||
| [ "$cross" -ne 0 ] && [ -n "$cross_suffix" ] && \ | |||
| printf "CROSS_SUFFIX=%s\n" "$cross_suffix" | |||
| [ "$cross" -ne 0 ] && printf "CROSS=1\n" | |||
| printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" | |||
| [ "$have_msa" -eq 1 ] && { | |||
| printf "HAVE_MSA=1\n" | |||
| printf "MSA_FLAGS=%s\n" "$msa_flags" | |||
| } | |||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| } >> "$makefile" | |||
| } | |||
| os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` | |||
| architecture=`echo "$architecture" | tr '[[:lower:]]' '[[:upper:]]' ` | |||
| compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` | |||
| open(MAKEFILE, "> $makefile") || die "Can't create $makefile"; | |||
| open(CONFFILE, "> $config" ) || die "Can't create $config"; | |||
| { | |||
| printf "#define OS_%s\t1\n" "$os" | |||
| printf "#define ARCH_%s\t1\n" "$architecture" | |||
| printf "#define C_%s\t1\n" "$compiler" | |||
| [ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n" | |||
| [ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n" | |||
| [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" | |||
| [ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n" | |||
| [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" | |||
| } >> "$config" | |||
| # print $data, "\n"; | |||
| print MAKEFILE "OSNAME=$os\n"; | |||
| print MAKEFILE "ARCH=$architecture\n"; | |||
| print MAKEFILE "C_COMPILER=$compiler\n"; | |||
| print MAKEFILE "BINARY32=\n" if $binformat ne bin32; | |||
| print MAKEFILE "BINARY64=\n" if $binformat ne bin64; | |||
| print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; | |||
| print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; | |||
| print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS=1\n" if $cross != 0; | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; | |||
| print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; | |||
| print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| $compiler =~ tr/[a-z]/[A-Z]/; | |||
| print CONFFILE "#define OS_$os\t1\n"; | |||
| print CONFFILE "#define ARCH_$architecture\t1\n"; | |||
| print CONFFILE "#define C_$compiler\t1\n"; | |||
| print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; | |||
| print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
| if ($os eq "LINUX") { | |||
| if [ "$os" = "LINUX" ]; then | |||
| # @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); | |||
| # if ($pthread[2] ne "") { | |||
| # print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; | |||
| # } else { | |||
| print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | |||
| printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config" | |||
| # } | |||
| } else { | |||
| print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | |||
| } | |||
| close(MAKEFILE); | |||
| close(CONFFILE); | |||
| else | |||
| printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config" | |||
| fi | |||
| @@ -0,0 +1,456 @@ | |||
| #!/usr/bin/env perl | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| # Determine the build host's OS and CPU family; both are compared with the | |||
| # compiler's target further down to decide whether this is a cross build. | |||
| $hostos = `uname -s | sed -e s/\-.*//`; | |||
| chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`; | |||
| if ($hostos eq "AIX" || $hostos eq "SunOS") { | |||
| $hostarch = `uname -p`; | |||
| } | |||
| chop($hostarch); | |||
| # Map the machine names reported by uname onto the names used for $architecture. | |||
| if ($hostarch eq "amd64") { | |||
| $hostarch = "x86_64"; | |||
| } elsif ($hostarch ne "arm64" && $hostarch =~ /^arm.*/) { | |||
| $hostarch = "arm"; | |||
| } elsif ($hostarch eq "aarch64") { | |||
| $hostarch = "arm64"; | |||
| } elsif ($hostarch =~ /^(powerpc|ppc).*/) { | |||
| $hostarch = "power"; | |||
| } elsif ($hostarch eq "s390x") { | |||
| $hostarch = "zarch"; | |||
| } | |||
| #$tmpf = new File::Temp( UNLINK => 1 ); | |||
| $binary = $ENV{"BINARY"}; | |||
| # Arguments: makefile fragment to write, config header to write, compiler, then compiler flags. | |||
| ($makefile, $config, $compiler_name) = splice(@ARGV, 0, 3); | |||
| $flags = join(" ", @ARGV); | |||
| # First, we need to know the target OS and compiler name: preprocess ctest.c and | |||
| # keep the output for the COMPILER_/OS_/ARCH_ pattern matches. | |||
| $data = `$compiler_name $flags -E ctest.c`; | |||
| unless ($? == 0) { | |||
| printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; | |||
| die 1; | |||
| } | |||
| # Derive $cross_suffix: the compiler's directory (unless it is ".") followed by the | |||
| # part of its file name up to and including the last "-", e.g. "/opt/bin/arm-linux-". | |||
| $cross_suffix = ""; | |||
| eval "use File::Basename"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Basename, emulating its functionality"; | |||
| # Fallback without File::Basename: split at the last "/". The directory is the text | |||
| # before that slash ("." when there is no slash, "/" when the only slash is leading). | |||
| my $slash = rindex($compiler_name, "/"); | |||
| my $dirnam = "."; | |||
| if ($slash == 0) { | |||
| $dirnam = "/"; | |||
| } elsif ($slash > 0) { | |||
| $dirnam = substr($compiler_name, 0, $slash); | |||
| } | |||
| if ($dirnam ne ".") { | |||
| $cross_suffix .= $dirnam . "/"; | |||
| } | |||
| # Everything after the last slash (the whole name when there is none). | |||
| my $basnam = substr($compiler_name, $slash + 1); | |||
| if ($basnam =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } else { | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } | |||
| # Identify the compiler from the markers in the preprocessed text. When several | |||
| # markers are present the one listed last wins; GCC is assumed when none is found. | |||
| $compiler = ""; | |||
| foreach my $id (qw(LSB CLANG PGI PATHSCALE INTEL OPEN64 SUN IBM DEC FUJITSU)) { | |||
| $compiler = $id if ($data =~ /COMPILER_$id/); | |||
| } | |||
| $compiler = "GCC" if ($compiler eq ""); | |||
| # Identify the target OS; the marker is "OS_" plus the upper-cased name. | |||
| foreach my $name (qw(Linux FreeBSD NetBSD OpenBSD DragonFly Darwin SunOS AIX osf WINNT CYGWIN_NT Interix Android Haiku)) { | |||
| my $marker = "OS_" . uc($name); | |||
| $os = $name if ($data =~ /$marker/); | |||
| } | |||
| # Identify the target architecture. Order matters: a later name overrides an earlier | |||
| # one, which is needed because e.g. the ARCH_X86 pattern also matches ARCH_X86_64. | |||
| foreach my $name (qw(x86 x86_64 e2k power mips mips64 alpha sparc ia64 arm arm64 zarch riscv64 loongarch64)) { | |||
| my $marker = "ARCH_" . uc($name); | |||
| $architecture = $name if ($data =~ /$marker/); | |||
| } | |||
| # Add the compiler switches that select the 32/64-bit ABI requested through BINARY. | |||
| # $defined is set once the platform or compiler has been dealt with here, which | |||
| # suppresses the generic -m32/-m64 fallback at the end. | |||
| $defined = 0; | |||
| if ($os eq "AIX") { | |||
| if ($binary eq "32") { | |||
| $compiler_name .= " -maix32"; | |||
| } elsif ($binary eq "64") { | |||
| $compiler_name .= " -maix64"; | |||
| } | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "mips") { | |||
| $compiler_name .= " -mabi=32"; | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "mips64") { | |||
| if ($binary eq "32") { | |||
| $compiler_name .= " -mabi=n32"; | |||
| } elsif ($binary eq "64") { | |||
| $compiler_name .= " -mabi=64"; | |||
| } | |||
| $defined = 1; | |||
| } | |||
| $defined = 1 if (($architecture eq "arm") || ($architecture eq "arm64")); | |||
| # These targets are always treated as 64-bit, whatever BINARY says. | |||
| if (grep { $architecture eq $_ } qw(zarch e2k alpha ia64 riscv64 loongarch64)) { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| # x86 is forced to BINARY=32, except on Darwin and SunOS. | |||
| if (($architecture eq "x86") && ($os ne "Darwin") && ($os ne "SunOS")) { | |||
| $defined = 1; | |||
| $binary = 32; | |||
| } | |||
| if ($compiler eq "PGI") { | |||
| if ($binary eq "32") { | |||
| $compiler_name .= " -tp p7"; | |||
| } elsif ($binary eq "64") { | |||
| $compiler_name .= " -tp p7-64"; | |||
| } | |||
| $defined = 1; | |||
| } | |||
| if ($compiler eq "IBM") { | |||
| if ($binary eq "32") { | |||
| $compiler_name .= " -q32"; | |||
| } elsif ($binary eq "64") { | |||
| $compiler_name .= " -q64"; | |||
| } | |||
| $defined = 1; | |||
| } | |||
| # OpenMP switch per compiler; compilers not listed leave $openmp untouched. | |||
| my %openmp_switch = ( | |||
| PGI => "-mp", | |||
| IBM => "-qsmp=omp", | |||
| INTEL => "-openmp", | |||
| PATHSCALE => "-mp", | |||
| OPEN64 => "-mp", | |||
| CLANG => "-fopenmp", | |||
| GCC => "-fopenmp", | |||
| LSB => "-fopenmp", | |||
| FUJITSU => "-Kopenmp", | |||
| ); | |||
| $openmp = $openmp_switch{$compiler} if (exists $openmp_switch{$compiler}); | |||
| unless ($defined) { | |||
| if ($binary eq "32") { | |||
| $compiler_name .= " -m32"; | |||
| } elsif ($binary eq "64") { | |||
| $compiler_name .= " -m64"; | |||
| } | |||
| } | |||
| # Do again: preprocess ctest.c once more, now with the ABI switches appended. | |||
| $data = `$compiler_name $flags -E ctest.c`; | |||
| unless ($? == 0) { | |||
| printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; | |||
| die 1; | |||
| } | |||
| # On mips/mips64, try to build a one-instruction MSA test program with the MSA flags; | |||
| # $have_msa becomes 1 when the compiler command succeeds. | |||
| $have_msa = 0; | |||
| if (grep { $architecture eq $_ } qw(mips mips64)) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| unless ($@) { | |||
| $tmpf = File::Temp->new( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n", "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o $tmpf"; | |||
| system("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| $have_msa = ($? != 0) ? 0 : 1; | |||
| unlink("$tmpf.o"); | |||
| } else { | |||
| warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||
| } | |||
| } | |||
| # Repeat the architecture detection on the current preprocessor output. A later | |||
| # name overrides an earlier one (the ARCH_X86 pattern also matches ARCH_X86_64). | |||
| foreach my $name (qw(x86 x86_64 e2k power mips mips64 alpha sparc ia64 arm arm64 zarch loongarch64)) { | |||
| my $marker = "ARCH_" . uc($name); | |||
| $architecture = $name if ($data =~ /$marker/); | |||
| } | |||
| # 64-bit output format when the BINARY_64 marker is present, 32-bit otherwise. | |||
| $binformat = ($data =~ /BINARY_64/) ? "bin64" : "bin32"; | |||
| # On x86/x86_64, compile a tiny program containing one AVX512 instruction; | |||
| # $no_avx512 becomes 1 when the compiler command fails. | |||
| $no_avx512= 0; | |||
| if (grep { $architecture eq $_ } qw(x86 x86_64)) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| unless ($@) { | |||
| # $tmpf = new File::Temp( UNLINK => 1 ); | |||
| ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| # PGI uses its own target-selection switch. | |||
| $args = ($compiler eq "PGI") ? " -tp skylake -c -o $tmpf.o $tmpf" : " -march=skylake-avx512 -c -o $tmpf.o $tmpf"; | |||
| system("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| $no_avx512 = ($? != 0) ? 1 : 0; | |||
| unlink("$tmpf.o"); | |||
| } else { | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||
| $no_avx512 = 0; | |||
| } | |||
| } | |||
| # On riscv64, compile a tiny program containing one vector instruction with | |||
| # -march=rv64gv; $no_rv64gv becomes 1 when the compiler command fails. | |||
| $no_rv64gv= 0; | |||
| if ($architecture eq "riscv64") { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| unless ($@) { | |||
| # $tmpf = new File::Temp( UNLINK => 1 ); | |||
| ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"vsetvli zero, zero, e8, m1\n"'; | |||
| print $fh "int main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=rv64gv -c -o $tmpf.o $tmpf"; | |||
| system("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| $no_rv64gv = ($? != 0) ? 1 : 0; | |||
| unlink("$tmpf.o"); | |||
| } else { | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with the RISCV vector extension"; | |||
| $no_rv64gv = 0; | |||
| } | |||
| } | |||
| # When the preprocessor output carries HAVE_C11, confirm that <stdatomic.h> really | |||
| # compiles; $c11_atomics becomes 1 only when the compiler command succeeds. | |||
| $c11_atomics = 0; | |||
| if ($data =~ /HAVE_C11/) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| unless ($@) { | |||
| ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
| print $fh "#include <stdatomic.h>\nint main(void){}\n"; | |||
| $args = " -c -o $tmpf.o $tmpf"; | |||
| system("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| $c11_atomics = ($? != 0) ? 0 : 1; | |||
| unlink("$tmpf.o"); | |||
| } else { | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; | |||
| $c11_atomics = 0; | |||
| } | |||
| } | |||
| # GCC on x86/x86_64: a -dumpversion value of at most 4.6 marks the compiler as old | |||
| # and as unusable for AVX2. $data is reused to hold the version string. | |||
| if ($compiler eq "GCC" && (($architecture eq "x86") || ($architecture eq "x86_64"))) { | |||
| $data = `$compiler_name -dumpversion`; | |||
| $no_avx2 = $oldgcc = ($data <= 4.6) ? 1 : 0; | |||
| } | |||
| # Compile ctest1.c to assembly and inspect its first "globl" line to learn which | |||
| # prefix characters (underscores/dots) the compiler puts in front of symbol names. | |||
| $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| # Only use $1 when this match succeeded; after a failed match $1 would still hold | |||
| # the capture of some earlier, unrelated match. | |||
| $need_fu = ($data =~ /globl\s([_\.]*)(.*)/) ? $1 : ""; | |||
| # Cross build when target and host architecture differ (x86 on an x86_64 host and | |||
| # mips on a mips64 host do not count) or when the operating systems differ. | |||
| $cross = 0; | |||
| if ($architecture ne $hostarch) { | |||
| $cross = 1; | |||
| $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86")); | |||
| $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); | |||
| } | |||
| $cross = 1 if ($os ne $hostos); | |||
| # Android target on a Linux host with TERMUX_APP_PID set is treated as a native build | |||
| # (presumably a build running inside Termux). Compared as a string: the value is not a number to test against "". | |||
| $cross = 0 if (($os eq "Android") && ($hostos eq "Linux") && defined($ENV{TERMUX_APP_PID}) && ($ENV{TERMUX_APP_PID} ne "")); | |||
| # Drop the OpenMP switch unless USE_OPENMP=1 was requested. | |||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | |||
| $linker_L = ""; | |||
| $linker_l = ""; | |||
| $linker_a = ""; | |||
| # Compile and verbosely (-v) link ctest2, then pick the library search paths (-L, -Y), | |||
| # --exclude-libs options, libraries (-l) and static archives (*.a) out of the output. | |||
| { | |||
| $link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; | |||
| $link =~ s/-Y\sP,/-Y/g; | |||
| @flags = split(/[\s,]/, $link); | |||
| # remove leading and trailing quotes from each flag. | |||
| s/^['"]|['"]$//g foreach @flags; | |||
| foreach my $word (@flags) { | |||
| if ($word =~ /^-L/) { | |||
| $linker_L .= $word . " " unless ($word =~ /^-LIST:/ || $word =~ /^-LANG:/); | |||
| } elsif ($word =~ /^-Y/) { | |||
| $linker_L .= "-Wl," . $word . " "; | |||
| } elsif ($word =~ /^--exclude-libs/) { | |||
| $linker_L .= "-Wl," . $word . " "; | |||
| # Blank the word so that the archive test below cannot pick it up as well. | |||
| $word = ""; | |||
| } elsif ($word =~ /^-l/) { | |||
| # Libraries whose names match any of these patterns are left out. | |||
| $linker_l .= $word . " " unless ($word =~ /gfortranbegin|frtbegin|pathfstart|numa|crt[0-9]|gcc|user32|kernel32|advapi32|shell32|omp|[0-9]+/); | |||
| } | |||
| $linker_a .= $word . " " if $word =~ /\.a$/; | |||
| } | |||
| } | |||
| # Write the detection results: a make fragment ($makefile) and a C header ($config). | |||
| open(MAKEFILE, "> $makefile") || die "Can't create $makefile"; | |||
| open(CONFFILE, "> $config" ) || die "Can't create $config"; | |||
| # print $data, "\n"; | |||
| print MAKEFILE "OSNAME=$os\n", "ARCH=$architecture\n", "C_COMPILER=$compiler\n"; | |||
| # The empty BINARYnn= line for the unused width is written before the BINARYnn=1 line. | |||
| foreach my $bits (32, 64) { | |||
| print MAKEFILE "BINARY$bits=\n" if $binformat ne "bin$bits"; | |||
| } | |||
| foreach my $bits (32, 64) { | |||
| print MAKEFILE "BINARY$bits=1\n" if $binformat eq "bin$bits"; | |||
| } | |||
| if ($need_fu ne "") { | |||
| print MAKEFILE "FU=$need_fu\n"; | |||
| } | |||
| if ($cross != 0) { | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS=1\n"; | |||
| } | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| if ($have_msa eq 1) { | |||
| print MAKEFILE "HAVE_MSA=1\n", "MSA_FLAGS=$msa_flags\n"; | |||
| } | |||
| print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; | |||
| print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; | |||
| # The header uses upper-case names in its OS_/ARCH_/C_ macros. | |||
| $os =~ tr/a-z/A-Z/; | |||
| $architecture =~ tr/a-z/A-Z/; | |||
| $compiler =~ tr/a-z/A-Z/; | |||
| print CONFFILE "#define OS_$os\t1\n", "#define ARCH_$architecture\t1\n", "#define C_$compiler\t1\n"; | |||
| print CONFFILE "#define __32BIT__\t1\n" if $binformat eq "bin32"; | |||
| print CONFFILE "#define __64BIT__\t1\n" if $binformat eq "bin64"; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
| # LINUX and every other OS get the same definition; the nm-based lookup of the | |||
| # pthread_create symbol that used to be LINUX-specific is disabled. | |||
| print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | |||
| close(MAKEFILE); | |||
| close(CONFFILE); | |||
| @@ -28,6 +28,8 @@ char* openblas_get_corename(void); | |||
| #ifdef OPENBLAS_OS_LINUX | |||
| /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ | |||
| int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); | |||
| /* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ | |||
| int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); | |||
| #endif | |||
| /* Get the parallelization type which is used by OpenBLAS */ | |||
| @@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE) | |||
| endif () | |||
| endif () | |||
| # Cortex-A510, Cortex-A710 and Cortex-X2 share the same -march setting; it is | |||
| # only added for non-DYNAMIC_ARCH builds. | |||
| if (${CORE} STREQUAL CORTEXA510 OR ${CORE} STREQUAL CORTEXA710 OR ${CORE} STREQUAL CORTEXX2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| # Cortex-X1 uses armv8.2-a instead. | |||
| if (${CORE} STREQUAL CORTEXX1) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL POWER10) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| @@ -50,6 +50,15 @@ else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| # Generate openblas.def: run the gensymbol script directly unless USE_PERL is defined; | |||
| # otherwise the script is passed as the first argument of the command in the else() branch. | |||
| if (NOT DEFINED USE_PERL) | |||
| add_custom_command( | |||
| OUTPUT ${PROJECT_BINARY_DIR}/openblas.def | |||
| #TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
| COMMAND "${PROJECT_SOURCE_DIR}/exports/gensymbol" | |||
| ARGS "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
| COMMENT "Create openblas.def file" | |||
| VERBATIM) | |||
| else() | |||
| add_custom_command( | |||
| OUTPUT ${PROJECT_BINARY_DIR}/openblas.def | |||
| #TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
| @@ -57,5 +66,5 @@ add_custom_command( | |||
| ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
| COMMENT "Create openblas.def file" | |||
| VERBATIM) | |||
| endif() | |||
| endif() | |||
| @@ -25,11 +25,19 @@ check_language(Fortran) | |||
| if(CMAKE_Fortran_COMPILER) | |||
| enable_language(Fortran) | |||
| else() | |||
| set (NOFORTRAN 1) | |||
| if (NOT NO_LAPACK) | |||
| message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | |||
| if (NOT XXXXX) | |||
| message(STATUS "No Fortran compiler found, can build only BLAS and f2c-converted LAPACK") | |||
| set(C_LAPACK 1) | |||
| if (INTERFACE64) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64") | |||
| endif () | |||
| set(TIMER "NONE") | |||
| else () | |||
| message(STATUS "No Fortran compiler found, can build only BLAS") | |||
| endif() | |||
| endif() | |||
| set (NOFORTRAN 1) | |||
| set (NO_LAPACK 1) | |||
| endif() | |||
| if (NOT ONLY_CBLAS) | |||
| @@ -67,7 +67,15 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| if (INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") | |||
| if (WIN32) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64") | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64") | |||
| endif () | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| endif () | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m32") | |||
| @@ -214,6 +222,17 @@ if (${F_COMPILER} STREQUAL "COMPAQ") | |||
| endif () | |||
| endif () | |||
| # Options used when the Fortran compiler is identified as CRAY. | |||
| if (${F_COMPILER} STREQUAL "CRAY") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -hnopattern") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") | |||
| # 64-bit integer interface requested. | |||
| if (INTERFACE64) | |||
| set (FCOMMON_OPT "${FCOMMON_OPT} -s integer64") | |||
| endif () | |||
| # Without USE_OPENMP, pass the compiler's "noomp" option. | |||
| if (NOT USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -O noomp") | |||
| endif () | |||
| endif () | |||
| # from the root Makefile - this is for lapack-netlib to compile the correct secnd file. | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| set(TIMER "INT_ETIME") | |||
| @@ -1,12 +1,14 @@ | |||
| # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. | |||
| if (NOT C_LAPACK) | |||
| message (STATUS "fortran lapack") | |||
| set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F | |||
| ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f | |||
| ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 | |||
| ../INSTALL/ilaver.f xerbla_array.f | |||
| ../INSTALL/slamch.f) | |||
| set(SCLAUX | |||
| scombssq.f sbdsvdx.f sstevx.f sstein.f | |||
| la_constants.f90 | |||
| sbdsdc.f | |||
| sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f | |||
| slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f | |||
| @@ -15,16 +17,17 @@ set(SCLAUX | |||
| slapy2.f slapy3.f slarnv.f | |||
| slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f | |||
| slarrk.f slarrr.f slaneg.f | |||
| slartg.f slaruv.f slas2.f slascl.f | |||
| slartg.f90 slaruv.f slas2.f slascl.f | |||
| slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f | |||
| slasd7.f slasd8.f slasda.f slasdq.f slasdt.f | |||
| slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f | |||
| slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f | |||
| slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f | |||
| ssteqr.f ssterf.f slaisnan.f sisnan.f | |||
| slartgp.f slartgs.f | |||
| slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f | |||
| ../INSTALL/second_${TIMER}.f) | |||
| set(DZLAUX | |||
| la_constants.f90 | |||
| dbdsdc.f | |||
| dbdsvdx.f dstevx.f dstein.f | |||
| dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | |||
| @@ -34,13 +37,13 @@ set(DZLAUX | |||
| dlapy2.f dlapy3.f dlarnv.f | |||
| dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f | |||
| dlarrk.f dlarrr.f dlaneg.f | |||
| dlartg.f dlaruv.f dlas2.f dlascl.f | |||
| dlartg.f90 dlaruv.f dlas2.f dlascl.f | |||
| dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f | |||
| dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f | |||
| dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f | |||
| dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f | |||
| dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f | |||
| dsteqr.f dsterf.f dlaisnan.f disnan.f | |||
| dlartgp.f dlartgs.f | |||
| dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f | |||
| ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) | |||
| set(SLASRC | |||
| @@ -58,6 +61,7 @@ set(SLASRC | |||
| sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f | |||
| sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f | |||
| shsein.f shseqr.f slabrd.f slacon.f slacn2.f | |||
| slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f | |||
| slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f | |||
| slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f | |||
| slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f | |||
| @@ -170,10 +174,11 @@ set(CLASRC | |||
| clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f | |||
| claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f | |||
| claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f | |||
| claqz0.f claqz1.f claqz2.f claqz3.f | |||
| claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f | |||
| clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f | |||
| clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f | |||
| clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f | |||
| clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f | |||
| clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 | |||
| clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f | |||
| clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f | |||
| cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f | |||
| @@ -244,6 +249,7 @@ set(DLASRC | |||
| dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f | |||
| dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f | |||
| dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f | |||
| dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f | |||
| dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f | |||
| dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f | |||
| dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f | |||
| @@ -345,6 +351,7 @@ set(ZLASRC | |||
| zhetrs_3.f zhecon_3.f zhesv_rk.f | |||
| zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f | |||
| zhgeqz.f zhpcon.f zhpev.f zhpevd.f | |||
| zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f | |||
| zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f | |||
| zhpsvx.f | |||
| zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f | |||
| @@ -362,9 +369,9 @@ set(ZLASRC | |||
| zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f | |||
| zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f | |||
| zlarfg.f zlarfgp.f zlarft.f | |||
| zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f | |||
| zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f | |||
| zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f | |||
| zlassq.f zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f | |||
| zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f | |||
| zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f | |||
| zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f | |||
| zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f | |||
| @@ -488,6 +495,499 @@ if(BUILD_COMPLEX16) | |||
| message(STATUS "Building Double Precision Complex") | |||
| endif() | |||
| else () | |||
| message (STATUS "c lapack") | |||
| set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c | |||
| ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c | |||
| ../INSTALL/ilaver.c xerbla_array.c | |||
| ../INSTALL/slamch.c) | |||
| set(SCLAUX | |||
| scombssq.c sbdsvdx.c sstevx.c sstein.c | |||
| sbdsdc.c | |||
| sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c | |||
| slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c | |||
| slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c | |||
| slagts.c slamrg.c slanst.c | |||
| slapy2.c slapy3.c slarnv.c | |||
| slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c | |||
| slarrk.c slarrr.c slaneg.c | |||
| slartg.c slaruv.c slas2.c slascl.c | |||
| slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c | |||
| slasd7.c slasd8.c slasda.c slasdq.c slasdt.c | |||
| slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c | |||
| slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c | |||
| ssteqr.c ssterf.c slaisnan.c sisnan.c | |||
| slartgp.c slartgs.c | |||
| ../INSTALL/second_${TIMER}.c) | |||
| set(DZLAUX | |||
| dbdsdc.c | |||
| dbdsvdx.c dstevx.c dstein.c | |||
| dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c | |||
| dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c | |||
| dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c | |||
| dlagts.c dlamrg.c dlanst.c | |||
| dlapy2.c dlapy3.c dlarnv.c | |||
| dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c | |||
| dlarrk.c dlarrr.c dlaneg.c | |||
| dlartg.c dlaruv.c dlas2.c dlascl.c | |||
| dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c | |||
| dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c | |||
| dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c | |||
| dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c | |||
| dsteqr.c dsterf.c dlaisnan.c disnan.c | |||
| dlartgp.c dlartgs.c | |||
| ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) | |||
| # Single precision real LAPACK routines; added to LA_REL_SRC when BUILD_SINGLE is set. | |||
| # Each source is listed exactly once: ssysv_aa.c, ssytrf_aa.c and ssytrs_aa.c | |||
| # appear only in the line that also carries their *_aa_2stage counterparts. | |||
| set(SLASRC | |||
| sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c | |||
| sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c | |||
| sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c | |||
| sgehd2.c sgehrd.c sgelq2.c sgelqf.c | |||
| sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c | |||
| sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c | |||
| sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c | |||
| sgetrf2.c sgetri.c | |||
| sggbak.c sggbal.c | |||
| sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c | |||
| sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c | |||
| sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c | |||
| sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c | |||
| shsein.c shseqr.c slabrd.c slacon.c slacn2.c | |||
| slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c | |||
| slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c | |||
| slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c | |||
| slansy.c slantb.c slantp.c slantr.c slanv2.c | |||
| slapll.c slapmt.c | |||
| slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c | |||
| slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c | |||
| slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c | |||
| slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c | |||
| slarrv.c slartv.c | |||
| slarz.c slarzb.c slarzt.c slasy2.c | |||
| slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c | |||
| slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c | |||
| sopgtr.c sopmtr.c sorg2l.c sorg2r.c | |||
| sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c | |||
| sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c | |||
| sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c | |||
| sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c | |||
| spbstf.c spbsv.c spbsvx.c | |||
| spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c | |||
| sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c | |||
| sppcon.c sppequ.c | |||
| spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c | |||
| spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c | |||
| ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c | |||
| ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c | |||
| sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c | |||
| ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c | |||
| ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c | |||
| ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c | |||
| ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c | |||
| ssyswapr.c ssytrs.c ssytrs2.c | |||
| ssyconv.c ssyconvf.c ssyconvf_rook.c | |||
| ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c | |||
| ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c | |||
| ssytri_rook.c ssycon_rook.c ssysv_rook.c | |||
| ssytf2_rk.c ssytrf_rk.c ssytrs_3.c | |||
| ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c | |||
| stbcon.c | |||
| stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c | |||
| stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c | |||
| stptrs.c | |||
| strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c | |||
| strtrs.c stzrzf.c sstemr.c | |||
| slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c | |||
| stfttr.c stpttf.c stpttr.c strttf.c strttp.c | |||
| sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c | |||
| sgeequb.c ssyequb.c spoequb.c sgbequb.c | |||
| sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c | |||
| sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c | |||
| sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c | |||
| stpqrt.c stpqrt2.c stpmqrt.c stprfb.c | |||
| sgelqt.c sgelqt3.c sgemlqt.c | |||
| sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c | |||
| sgelq.c slaswlq.c slamswlq.c sgemlq.c | |||
| stplqt.c stplqt2.c stpmlqt.c | |||
| ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c | |||
| ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c | |||
| ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c | |||
| sgesvdq.c slaorhr_col_getrfnp.c | |||
| slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c ) | |||
| # Single precision sources that are only referenced through ALLXOBJ, | |||
| # i.e. when USE_XBLAS is set. One routine family per line. | |||
| set(SXLASRC | |||
|   sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c | |||
|   ssysvxx.c ssyrfsx.c sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c | |||
|   sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c sla_porpvgrw.c | |||
|   sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c | |||
|   sla_lin_berr.c slarscl2.c slascl2.c sla_wwaddw.c | |||
| ) | |||
| # Single precision complex LAPACK routines; added to LA_REL_SRC when BUILD_COMPLEX is set. | |||
| set(CLASRC | |||
|   cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c | |||
|   cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c | |||
|   cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c | |||
|   cgehd2.c cgehrd.c cgelq2.c cgelqf.c | |||
|   cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c | |||
|   cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c | |||
|   cgesc2.c cgesdd.c cgesvd.c cgesvdx.c | |||
|   cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c | |||
|   cgesvx.c cgetc2.c cgetrf2.c | |||
|   cgetri.c | |||
|   cggbak.c cggbal.c | |||
|   cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c | |||
|   cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c | |||
|   cggsvd3.c cggsvp3.c | |||
|   cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c | |||
|   chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c | |||
|   checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c | |||
|   chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c | |||
|   chetf2.c chetrd.c | |||
|   chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c | |||
|   chetrs.c chetrs2.c | |||
|   chetf2_rook.c chetrf_rook.c chetri_rook.c | |||
|   chetrs_rook.c checon_rook.c chesv_rook.c | |||
|   chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c | |||
|   chetrs_3.c checon_3.c chesv_rk.c | |||
|   chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c | |||
|   chgeqz.c chpcon.c chpev.c chpevd.c | |||
|   chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c | |||
|   chpsvx.c | |||
|   chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c | |||
|   clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c | |||
|   claed0.c claed7.c claed8.c | |||
|   claein.c claesy.c claev2.c clags2.c clagtm.c | |||
|   clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c | |||
|   clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c | |||
|   clanhb.c clanhe.c | |||
|   clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c | |||
|   clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c | |||
|   claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c | |||
|   claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c | |||
|   claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c | |||
|   clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c | |||
|   clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c | |||
|   clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c | |||
|   clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c | |||
|   clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c | |||
|   cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c | |||
|   cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c | |||
|   cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c | |||
|   cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c | |||
|   cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c | |||
|   crot.c cspcon.c csprfs.c cspsv.c | |||
|   cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c | |||
|   cstegr.c cstein.c csteqr.c csycon.c | |||
|   csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c | |||
|   csytri2.c csytri2x.c csyswapr.c | |||
|   csytrs.c csytrs2.c | |||
|   csyconv.c csyconvf.c csyconvf_rook.c | |||
|   csytf2_rook.c csytrf_rook.c csytrs_rook.c | |||
|   csytri_rook.c csycon_rook.c csysv_rook.c | |||
|   csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c | |||
|   csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c | |||
|   ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c | |||
|   ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c | |||
|   ctprfs.c ctptri.c | |||
|   ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c | |||
|   ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c | |||
|   cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c | |||
|   cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c | |||
|   cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c | |||
|   cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c | |||
|   chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c | |||
|   ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c | |||
|   cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c | |||
|   cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c | |||
|   cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c | |||
|   cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c | |||
|   ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c | |||
|   cgelqt.c cgelqt3.c cgemlqt.c | |||
|   cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c | |||
|   cgelq.c claswlq.c clamswlq.c cgemlq.c | |||
|   ctplqt.c ctplqt2.c ctpmlqt.c | |||
|   chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c | |||
|   cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c | |||
|   chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c | |||
|   cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c | |||
|   cungtsqr.c cungtsqr_row.c cunhr_col.c | |||
| ) | |||
| # Single precision complex sources that are only referenced through ALLXOBJ, | |||
| # i.e. when USE_XBLAS is set. | |||
| set(CXLASRC | |||
|   cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c | |||
|   cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c | |||
|   csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c | |||
|   cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c | |||
|   cposvxx.c cporfsx.c cla_porfsx_extended.c | |||
|   cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c | |||
|   cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c | |||
|   cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c | |||
|   chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c | |||
|   cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c | |||
|   cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c | |||
| ) | |||
| # Double precision real LAPACK routines; added to LA_REL_SRC when BUILD_DOUBLE is set. | |||
| set(DLASRC | |||
|   dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c | |||
|   dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c | |||
|   dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c | |||
|   dgehd2.c dgehrd.c dgelq2.c dgelqf.c | |||
|   dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c | |||
|   dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c | |||
|   dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c | |||
|   dgetrf2.c dgetri.c | |||
|   dggbak.c dggbal.c | |||
|   dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c | |||
|   dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c | |||
|   dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c | |||
|   dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c | |||
|   dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c | |||
|   dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c | |||
|   dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c | |||
|   dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c | |||
|   dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c | |||
|   dlapll.c dlapmt.c | |||
|   dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c | |||
|   dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c | |||
|   dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c | |||
|   dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c | |||
|   dlargv.c dlarrv.c dlartv.c | |||
|   dlarz.c dlarzb.c dlarzt.c dlasy2.c | |||
|   dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c | |||
|   dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c | |||
|   dopgtr.c dopmtr.c dorg2l.c dorg2r.c | |||
|   dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c | |||
|   dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c | |||
|   dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c | |||
|   dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c | |||
|   dpbstf.c dpbsv.c dpbsvx.c | |||
|   dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c | |||
|   dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c | |||
|   dppcon.c dppequ.c | |||
|   dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c | |||
|   dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c | |||
|   dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c | |||
|   dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c | |||
|   dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c | |||
|   dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c | |||
|   dsycon.c dsyev.c dsyevd.c dsyevr.c | |||
|   dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c | |||
|   dsysv.c dsysvx.c | |||
|   dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c | |||
|   dsytri2.c dsytri2x.c dsyswapr.c | |||
|   dsyconv.c dsyconvf.c dsyconvf_rook.c | |||
|   dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c | |||
|   dsytri_rook.c dsycon_rook.c dsysv_rook.c | |||
|   dsytf2_rk.c dsytrf_rk.c dsytrs_3.c | |||
|   dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c | |||
|   dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c | |||
|   dtbcon.c | |||
|   dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c | |||
|   dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c | |||
|   dtptrs.c | |||
|   dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c | |||
|   dtrtrs.c dtzrzf.c dstemr.c | |||
|   dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c | |||
|   dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c | |||
|   dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c | |||
|   dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c | |||
|   dgeequb.c dsyequb.c dpoequb.c dgbequb.c | |||
|   dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c | |||
|   dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c | |||
|   dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c | |||
|   dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c | |||
|   dgelqt.c dgelqt3.c dgemlqt.c | |||
|   dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c | |||
|   dgelq.c dlaswlq.c dlamswlq.c dgemlq.c | |||
|   dtplqt.c dtplqt2.c dtpmlqt.c | |||
|   dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c | |||
|   dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c | |||
|   dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c | |||
|   dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c | |||
|   dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c | |||
| ) | |||
| # Double precision sources that are only referenced through ALLXOBJ, | |||
| # i.e. when USE_XBLAS is set. One routine family per line. | |||
| set(DXLASRC | |||
|   dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c | |||
|   dsysvxx.c dsyrfsx.c dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c | |||
|   dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c dla_porpvgrw.c | |||
|   dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c | |||
|   dla_lin_berr.c dlarscl2.c dlascl2.c dla_wwaddw.c | |||
| ) | |||
| # Double precision complex LAPACK routines; added to LA_REL_SRC when BUILD_COMPLEX16 is set. | |||
| set(ZLASRC | |||
|   zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c | |||
|   zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c | |||
|   zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c | |||
|   zgehd2.c zgehrd.c zgelq2.c zgelqf.c | |||
|   zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c | |||
|   zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c | |||
|   zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c | |||
|   zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c | |||
|   zgetc2.c zgetrf2.c | |||
|   zgetri.c | |||
|   zggbak.c zggbal.c | |||
|   zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c | |||
|   zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c | |||
|   zggsvd3.c zggsvp3.c | |||
|   zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c | |||
|   zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c | |||
|   zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c | |||
|   zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c | |||
|   zhetf2.c zhetrd.c | |||
|   zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c | |||
|   zhetrs.c zhetrs2.c | |||
|   zhetf2_rook.c zhetrf_rook.c zhetri_rook.c | |||
|   zhetrs_rook.c zhecon_rook.c zhesv_rook.c | |||
|   zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c | |||
|   zhetrs_3.c zhecon_3.c zhesv_rk.c | |||
|   zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c | |||
|   zhgeqz.c zhpcon.c zhpev.c zhpevd.c | |||
|   zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c | |||
|   zhpsvx.c | |||
|   zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c | |||
|   zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c | |||
|   zlaed0.c zlaed7.c zlaed8.c | |||
|   zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c | |||
|   zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c | |||
|   zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c | |||
|   zlangt.c zlanhb.c | |||
|   zlanhe.c | |||
|   zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c | |||
|   zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c | |||
|   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c | |||
|   zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c | |||
|   zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c | |||
|   zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c | |||
|   zlarfg.c zlarfgp.c zlarft.c | |||
|   zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c | |||
|   zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c | |||
|   zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c | |||
|   zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c | |||
|   zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c | |||
|   zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c | |||
|   zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c | |||
|   zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c | |||
|   zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c | |||
|   zrot.c zspcon.c zsprfs.c zspsv.c | |||
|   zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c | |||
|   zstegr.c zstein.c zsteqr.c zsycon.c | |||
|   zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c | |||
|   zsytri2.c zsytri2x.c zsyswapr.c | |||
|   zsytrs.c zsytrs2.c | |||
|   zsyconv.c zsyconvf.c zsyconvf_rook.c | |||
|   zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c | |||
|   zsytri_rook.c zsycon_rook.c zsysv_rook.c | |||
|   zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c | |||
|   zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c | |||
|   ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c | |||
|   ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c | |||
|   ztprfs.c ztptri.c | |||
|   ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c | |||
|   ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c | |||
|   zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c | |||
|   zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c | |||
|   zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c | |||
|   zunmtr.c zupgtr.c | |||
|   zupmtr.c izmax1.c dzsum1.c zstemr.c | |||
|   zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c | |||
|   zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c | |||
|   ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c | |||
|   zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c | |||
|   zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c | |||
|   zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c | |||
|   zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c | |||
|   ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c | |||
|   ztplqt.c ztplqt2.c ztpmlqt.c | |||
|   zgelqt.c zgelqt3.c zgemlqt.c | |||
|   zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c | |||
|   zgelq.c zlaswlq.c zlamswlq.c zgemlq.c | |||
|   zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c | |||
|   zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c | |||
|   zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c | |||
|   zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c | |||
|   zungtsqr.c zungtsqr_row.c zunhr_col.c | |||
| ) | |||
| # Double precision complex sources that are only referenced through ALLXOBJ, | |||
| # i.e. when USE_XBLAS is set. One routine family per line. | |||
| set(ZXLASRC | |||
|   zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c | |||
|   zsysvxx.c zsyrfsx.c zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c zla_syrpvgrw.c | |||
|   zposvxx.c zporfsx.c zla_porfsx_extended.c zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c | |||
|   zgbsvxx.c zgbrfsx.c zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c zla_gbrpvgrw.c | |||
|   zhesvxx.c zherfsx.c zla_herfsx_extended.c zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c | |||
|   zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c | |||
| ) | |||
| # With USE_XBLAS set, gather the *XLASRC lists of all four precisions into ALLXOBJ. | |||
| if (USE_XBLAS) | |||
|   set(ALLXOBJ | |||
|     ${SXLASRC} | |||
|     ${DXLASRC} | |||
|     ${CXLASRC} | |||
|     ${ZXLASRC}) | |||
| endif () | |||
| # Append the routines kept in the DEPRECATED subdirectory to each precision's list. | |||
| # The same nine routines are added for S, D, C and Z. | |||
| list(APPEND SLASRC | |||
|   DEPRECATED/sgegs.c DEPRECATED/sgegv.c DEPRECATED/sgeqpf.c | |||
|   DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c | |||
|   DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) | |||
| list(APPEND DLASRC | |||
|   DEPRECATED/dgegs.c DEPRECATED/dgegv.c DEPRECATED/dgeqpf.c | |||
|   DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c DEPRECATED/dggsvp.c | |||
|   DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) | |||
| list(APPEND CLASRC | |||
|   DEPRECATED/cgegs.c DEPRECATED/cgegv.c DEPRECATED/cgeqpf.c | |||
|   DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c DEPRECATED/cggsvp.c | |||
|   DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) | |||
| list(APPEND ZLASRC | |||
|   DEPRECATED/zgegs.c DEPRECATED/zgegv.c DEPRECATED/zgeqpf.c | |||
|   DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c | |||
|   DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) | |||
| message(STATUS "Building deprecated routines") | |||
| # Sources shared between two precisions: DSLASRC is pulled in by the single and | |||
| # double builds, ZCLASRC by the complex and complex16 builds. | |||
| set(DSLASRC | |||
|   spotrs.c) | |||
| set(ZCLASRC | |||
|   cpotrs.c) | |||
| # Matrix generator sources. SCATGEN is shared by the single and complex builds, | |||
| # DZATGEN by the double and complex16 builds; the *MATGEN lists are per precision. | |||
| set(SCATGEN | |||
|   slatm1.c slaran.c slarnd.c) | |||
| set(SMATGEN | |||
|   slatms.c slatme.c slatmr.c slatmt.c | |||
|   slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c | |||
|   slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) | |||
| set(CMATGEN | |||
|   clatms.c clatme.c clatmr.c clatmt.c | |||
|   clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c | |||
|   clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) | |||
| set(DZATGEN | |||
|   dlatm1.c dlaran.c dlarnd.c) | |||
| set(DMATGEN | |||
|   dlatms.c dlatme.c dlatmr.c dlatmt.c | |||
|   dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c | |||
|   dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) | |||
| set(ZMATGEN | |||
|   zlatms.c zlatme.c zlatmr.c zlatmt.c | |||
|   zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c | |||
|   zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) | |||
| if (BUILD_SINGLE) | |||
|   # Single precision real: (re)initialise the release and generator source lists. | |||
|   set(LA_REL_SRC | |||
|     ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) | |||
|   set(LA_GEN_SRC | |||
|     ${SMATGEN} ${SCATGEN}) | |||
|   message(STATUS "Building Single Precision") | |||
| endif () | |||
| if (BUILD_DOUBLE) | |||
|   # Double precision real: extend whatever the lists already hold. | |||
|   list(APPEND LA_REL_SRC ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) | |||
|   list(APPEND LA_GEN_SRC ${DMATGEN} ${DZATGEN}) | |||
|   message(STATUS "Building Double Precision") | |||
| endif () | |||
| if (BUILD_COMPLEX) | |||
|   # Single precision complex: extend whatever the lists already hold. | |||
|   list(APPEND LA_REL_SRC ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) | |||
|   list(APPEND LA_GEN_SRC ${CMATGEN} ${SCATGEN}) | |||
|   message(STATUS "Building Single Precision Complex") | |||
| endif () | |||
| if (BUILD_COMPLEX16) | |||
|   # Double precision complex: extend whatever the lists already hold. | |||
|   list(APPEND LA_REL_SRC ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) | |||
|   list(APPEND LA_GEN_SRC ${ZMATGEN} ${DZATGEN}) | |||
|   # dcombssq.c is part of DLASRC, so it has to be added explicitly for | |||
|   # zlange/zlanhe when the double precision real build is disabled. | |||
|   if (NOT BUILD_DOUBLE) | |||
|     list(APPEND LA_REL_SRC dcombssq.c) | |||
|   endif () | |||
|   message(STATUS "Building Double Precision Complex") | |||
| endif () | |||
| endif() | |||
| # add lapack-netlib folder to the sources | |||
| set(LA_SOURCES "") | |||
| foreach (LA_FILE ${LA_REL_SRC}) | |||
| @@ -496,4 +996,9 @@ endforeach () | |||
| # Prefix every generator source with its TESTING/MATGEN directory and collect it in LA_SOURCES. | |||
| foreach (LA_FILE ${LA_GEN_SRC}) | |||
|   list(APPEND LA_SOURCES | |||
|     "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") | |||
| endforeach () | |||
| # Pick the compile flags for the collected sources exactly once: | |||
| # LAPACK_FFLAGS unless C_LAPACK is set, LAPACK_CFLAGS otherwise. | |||
| # (An unconditional call before this if/else would be overwritten by either branch.) | |||
| if (NOT C_LAPACK) | |||
| set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") | |||
| else () | |||
| set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||
| endif () | |||
| @@ -131,6 +131,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| @@ -143,6 +145,684 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "ATOM") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t24576\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t8192\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t8192\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 1) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "PRESCOTT") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t16384\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t1048576\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t8192\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t8192\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t8192\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t8192\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "NEHALEM") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t65535\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t32768\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t65536\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t32768\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 1) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(CGEMM3M_UNROLL_M 4) | |||
| set(CGEMM3M_UNROLL_N 8) | |||
| set(ZGEMM3M_UNROLL_M 2) | |||
| set(ZGEMM3M_UNROLL_N 8) | |||
| elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t24576\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t32768\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t24576\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 1) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(CGEMM3M_UNROLL_M 4) | |||
| set(CGEMM3M_UNROLL_N 8) | |||
| set(ZGEMM3M_UNROLL_M 2) | |||
| set(ZGEMM3M_UNROLL_N 8) | |||
| elseif ("${TCORE}" STREQUAL "HASWELL") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_AVX2\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t20480\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t32768\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t12288\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_AVX2 1) | |||
| set(HAVE_FMA3 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "SKYLAKEX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_AVX2\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define HAVE_AVX512VL\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t28672\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t8192\n") | |||
| set(HAVE_CMOV 1) | |||
| set(HAVE_MMX 1) | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_AVX2 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_AVX512VL 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 16) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "COOPERLAKE") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_AVX2\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define HAVE_AVX512VL\n" | |||
| "#define HAVE_AVX512BF16\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t20480\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t8192\n") | |||
| set(HAVE_CMOV 1) | |||
| set(HAVE_MMX 1) | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_AVX2 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_AVX512VL 1) | |||
| set(HAVE_AVX512BF16 1) | |||
| set(SBGEMM_UNROLL_M 16) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 16) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "SAPPHIRERAPIDS") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_CMOV\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_AVX2\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define HAVE_AVX512VL\n" | |||
| "#define HAVE_AVX512BF16\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t20480\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t12288\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t8192\n") | |||
| set(HAVE_CMOV 1) | |||
| set(HAVE_MMX 1) | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_AVX2 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_AVX512VL 1) | |||
| set(HAVE_AVX512BF16 1) | |||
| set(SBGEMM_UNROLL_M 32) | |||
| set(SBGEMM_UNROLL_N 16) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 16) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "OPTERON") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t1048576\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t32\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_3DNOW\n" | |||
| "#define HAVE_3DNOWEX\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t15360\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t15360\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t15360\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t15360\n") | |||
| set(HAVE_3DNOW 1) | |||
| set(HAVE_3DNOWEX 1) | |||
| set(HAVE_MMX 1) | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "BARCELONA") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t14336\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t14336\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t14336\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t14336\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "BULLDOZER") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t49152\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t1024000\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t32\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t5376\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t5376\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t14336\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t14336\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "PILEDRIVER") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t16384\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t2097152\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define HAVE_CFLUSH\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t6144\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t5376\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t10752\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t10752\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(HAVE_CFLUSH 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "STEAMROLLER") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t16384\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t2097152\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define HAVE_CFLUSH\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t6144\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t5120\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t10240\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t10240\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(HAVE_CFLUSH 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "EXCAVATOR") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t16384\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t2097152\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define HAVE_CFLUSH\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t6144\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t5120\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t10240\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t10240\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(HAVE_CFLUSH 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 4) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "ZEN") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_MMX\n" | |||
| "#define HAVE_SSE\n" | |||
| "#define HAVE_SSE2\n" | |||
| "#define HAVE_SSE3\n" | |||
| "#define HAVE_SSE4_1\n" | |||
| "#define HAVE_SSE4_2\n" | |||
| "#define HAVE_SSE4A\n" | |||
| "#define HAVE_MISALIGNSSE\n" | |||
| "#define HAVE_128BITFPU\n" | |||
| "#define HAVE_FASTMOVU\n" | |||
| "#define HAVE_CFLUSH\n" | |||
| "#define HAVE_AVX\n" | |||
| "#define HAVE_AVX2\n" | |||
| "#define HAVE_FMA3\n" | |||
| "#define SLOCAL_BUFFER_SIZE\t20480\n" | |||
| "#define DLOCAL_BUFFER_SIZE\t32768\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t12288\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSE4_1 1) | |||
| set(HAVE_SSE4_2 1) | |||
| set(HAVE_AVX 1) | |||
| set(HAVE_AVX2 1) | |||
| set(HAVE_FMA3 1) | |||
| set(HAVE_SSE4A 1) | |||
| set(HAVE_MISALIGNSSE 1) | |||
| set(HAVE_128BITFPU 1) | |||
| set(HAVE_FASTMOVU 1) | |||
| set(HAVE_CFLUSH 1) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(CGEMM3M_UNROLL_M 8) | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "ARMV7") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| @@ -199,12 +879,12 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| if ("${TCORE}" STREQUAL "CORTEXA57") | |||
| if ("${TCORE}" STREQUAL "CORTEXA57") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| else () | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| endif () | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| @@ -581,6 +1261,15 @@ endif () | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| elseif ("${TCORE}" STREQUAL "GENERIC") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| "#define L1_DATA_LINESIZE 128\n" | |||
| "#define L2_SIZE 524288\n" | |||
| "#define L2_LINESIZE 128 \n" | |||
| "#define DTB_DEFAULT_ENTRIES 128\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define L2_ASSOCIATIVE 8\n") | |||
| endif() | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| @@ -603,7 +1292,7 @@ endif () | |||
| "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") | |||
| # Move to where gen_config_h would place it | |||
| file(MAKE_DIRECTORY ${TARGET_CONF_DIR}) | |||
| file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") | |||
| file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") | |||
| else(NOT CMAKE_CROSSCOMPILING) | |||
| # compile getarch | |||
| @@ -639,7 +1328,7 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| if (NOT ${GETARCH_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| endif () | |||
| @@ -284,8 +284,15 @@ if (NOT NOFORTRAN) | |||
| # Fortran Compiler dependent settings | |||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||
| else () | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| if (NOT XXXX) | |||
| set(C_LAPACK 1) | |||
| if (INTERFACE64) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64") | |||
| endif () | |||
| set(TIMER "NONE") | |||
| else () | |||
| set (NO_LAPACK 1) | |||
| endif () | |||
| endif () | |||
| if (BINARY64) | |||
| @@ -552,6 +559,14 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| endforeach () | |||
| endif () | |||
| if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY") | |||
| set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | |||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||
| endforeach () | |||
| endif () | |||
| if ("${F_COMPILER}" STREQUAL "GFORTRAN") | |||
| # lapack-netlib is rife with uninitialized warnings -hpa | |||
| set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized") | |||
| @@ -31,7 +31,11 @@ endif() | |||
| # Pretty thorough determination of arch. Add more if needed | |||
| if(CMAKE_CL_64 OR MINGW64) | |||
| set(X86_64 1) | |||
| if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") | |||
| set(ARM64 1) | |||
| else() | |||
| set(X86_64 1) | |||
| endif() | |||
| elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
| @@ -33,9 +33,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef COMMON_ARM64 | |||
| #define COMMON_ARM64 | |||
| #ifdef C_MSVC | |||
| #include <intrin.h> | |||
| #define MB __dmb(_ARM64_BARRIER_ISH) | |||
| #define WMB __dmb(_ARM64_BARRIER_ISHST) | |||
| #define RMB __dmb(_ARM64_BARRIER_ISHLD) | |||
| #else | |||
| #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") | |||
| #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") | |||
| #define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") | |||
| #endif | |||
| #define INLINE inline | |||
| @@ -53,6 +60,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| BLASULONG ret; | |||
| do { | |||
| #ifndef C_MSVC | |||
| __asm__ __volatile__( | |||
| "mov x4, #1 \n\t" | |||
| "sevl \n\t" | |||
| @@ -70,7 +78,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| ); | |||
| #else | |||
| while (*address) {YIELDING;} | |||
| ret=InterlockedExchange64((volatile LONG64 *)(address), 1); | |||
| #endif | |||
| } while (ret); | |||
| @@ -80,6 +91,14 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #if !defined(OS_DARWIN) && !defined (OS_ANDROID) | |||
| static __inline BLASULONG rpcc(void){ | |||
| #ifdef C_MSVC | |||
| const int64_t pmccntr_el0 = (((3 & 1) << 14) | // op0 | |||
| ((3 & 7) << 11) | // op1 | |||
| ((9 & 15) << 7) | // crn | |||
| ((13 & 15) << 3) | // crm | |||
| ((0 & 7) << 0)); // op2 | |||
| return _ReadStatusReg(pmccntr_el0); | |||
| #else | |||
| BLASULONG ret = 0; | |||
| blasint shift; | |||
| @@ -87,6 +106,7 @@ static __inline BLASULONG rpcc(void){ | |||
| __asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift)); | |||
| return ret << shift; | |||
| #endif | |||
| } | |||
| #define RPCC_DEFINED | |||
| @@ -2610,8 +2610,9 @@ | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | |||
| || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) | |||
| #if !defined(DYNAMIC_ARCH) \ | |||
| && (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \ | |||
| || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG sbgemm_p; | |||
| @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define SEEK_ADDRESS | |||
| #if defined(C910V) | |||
| #include <riscv-vector.h> | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| #endif | |||
| @@ -6,12 +6,14 @@ | |||
| #include "../cblas.h" | |||
| #include "cpp_thread_safety_common.h" | |||
| void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){ | |||
| void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize) | |||
| { | |||
| const blasint inc = 1; | |||
| cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc); | |||
| } | |||
| } | |||
| int main(int argc, char* argv[]){ | |||
| int main(int argc, char* argv[]) | |||
| { | |||
| blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used | |||
| uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested | |||
| uint32_t numTestRounds = 16; //number of testing rounds before success exit | |||
| @@ -20,20 +22,23 @@ int main(int argc, char* argv[]){ | |||
| if (maxHwThreads < 52) | |||
| numConcurrentThreads = maxHwThreads; | |||
| if (argc > 4){ | |||
| if (argc > 4) | |||
| { | |||
| std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; | |||
| abort(); | |||
| } | |||
| if(argc == 4){ | |||
| } | |||
| if(argc == 4) | |||
| { | |||
| std::vector<std::string> cliArgs; | |||
| for (int i = 1; i < argc; i++){ | |||
| for (int i = 1; i < argc; i++) | |||
| { | |||
| cliArgs.push_back(argv[i]); | |||
| std::cout<<argv[i]<<std::endl; | |||
| } | |||
| } | |||
| randomMatSize = std::stoul(cliArgs.at(0)); | |||
| numConcurrentThreads = std::stoul(cliArgs.at(1)); | |||
| numTestRounds = std::stoul(cliArgs.at(2)); | |||
| } | |||
| } | |||
| std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; | |||
| std::vector<std::vector<double>> matBlock(numConcurrentThreads); | |||
| @@ -56,15 +61,18 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Preparing to test CBLAS DGEMV thread safety\n"; | |||
| std::cout<<"Allocating matrices..."<<std::flush; | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++) | |||
| { | |||
| matBlock.at(i).resize(randomMatSize*randomMatSize); | |||
| } | |||
| } | |||
| std::cout<<"done\n"; | |||
| std::cout<<"Allocating vectors..."<<std::flush; | |||
| for(uint32_t i=0; i<(numConcurrentThreads*2); i++){ | |||
| for(uint32_t i=0; i<(numConcurrentThreads*2); i++) | |||
| { | |||
| vecBlock.at(i).resize(randomMatSize); | |||
| } | |||
| } | |||
| std::cout<<"done\n"; | |||
| //pauser(); | |||
| std::cout<<"Filling matrices with random numbers..."<<std::flush; | |||
| @@ -77,31 +85,35 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl; | |||
| omp_set_num_threads(numConcurrentThreads); | |||
| for(uint32_t R=0; R<numTestRounds; R++){ | |||
| for(uint32_t R=0; R<numTestRounds; R++) | |||
| { | |||
| std::cout<<"DGEMV round #"<<R<<std::endl; | |||
| std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush; | |||
| #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads) | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++) | |||
| { | |||
| futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize); | |||
| } | |||
| } | |||
| std::cout<<"done\n"; | |||
| std::cout<<"Waiting for threads to finish..."<<std::flush; | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||
| for(uint32_t i=0; i<numConcurrentThreads; i++) | |||
| { | |||
| futureBlock[i].get(); | |||
| } | |||
| } | |||
| std::cout<<"done\n"; | |||
| std::cout<<"Comparing results from different threads..."<<std::flush; | |||
| for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread | |||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ | |||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++) | |||
| { | |||
| if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread | |||
| std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl; | |||
| std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl; | |||
| return -1; | |||
| } | |||
| } | |||
| } | |||
| std::cout<<"OK!\n"<<std::endl; | |||
| } | |||
| std::cout<<"OK!\n"<<std::endl; | |||
| } | |||
| std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl; | |||
| return 0; | |||
| } | |||
| } | |||
| @@ -45,6 +45,10 @@ size_t length64=sizeof(value64); | |||
| #define CPU_NEOVERSEN1 11 | |||
| #define CPU_NEOVERSEV1 16 | |||
| #define CPU_NEOVERSEN2 17 | |||
| #define CPU_CORTEXX1 18 | |||
| #define CPU_CORTEXX2 19 | |||
| #define CPU_CORTEXA510 20 | |||
| #define CPU_CORTEXA710 21 | |||
| // Qualcomm | |||
| #define CPU_FALKOR 6 | |||
| // Cavium | |||
| @@ -59,6 +63,8 @@ size_t length64=sizeof(value64); | |||
| #define CPU_VORTEX 13 | |||
| // Fujitsu | |||
| #define CPU_A64FX 15 | |||
| // Phytium | |||
| #define CPU_FT2000 22 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -73,12 +79,17 @@ static char *cpuname[] = { | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1", | |||
| "NEOVERSEV1" | |||
| "NEOVERSEN2" | |||
| "THUNDERX3T110", | |||
| "VORTEX", | |||
| "CORTEXA55", | |||
| "A64FX" | |||
| "A64FX", | |||
| "NEOVERSEV1", | |||
| "NEOVERSEN2", | |||
| "CORTEXX1", | |||
| "CORTEXX2", | |||
| "CORTEXA510", | |||
| "CORTEXA710", | |||
| "FT2000" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -94,12 +105,17 @@ static char *cpuname_lower[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "vortex", | |||
| "cortexa55", | |||
| "a64fx" | |||
| "a64fx", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "cortexx1", | |||
| "cortexx2", | |||
| "cortexa510", | |||
| "cortexa710", | |||
| "ft2000" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -182,6 +198,14 @@ int detect(void) | |||
| return CPU_NEOVERSEN2; | |||
| else if (strstr(cpu_part, "0xd05")) | |||
| return CPU_CORTEXA55; | |||
| else if (strstr(cpu_part, "0xd46")) | |||
| return CPU_CORTEXA510; | |||
| else if (strstr(cpu_part, "0xd47")) | |||
| return CPU_CORTEXA710; | |||
| else if (strstr(cpu_part, "0xd44")) | |||
| return CPU_CORTEXX1; | |||
| else if (strstr(cpu_part, "0xd4c")) | |||
| return CPU_CORTEXX2; | |||
| } | |||
| // Qualcomm | |||
| else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
| @@ -202,6 +226,13 @@ int detect(void) | |||
| // Fujitsu | |||
| else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) | |||
| return CPU_A64FX; | |||
| // Apple | |||
| else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022")) | |||
| return CPU_VORTEX; | |||
| // Phytium | |||
| else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661") | |||
| || strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663"))) | |||
| return CPU_FT2000; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -382,7 +413,24 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_CORTEXA510: | |||
| case CPU_CORTEXA710: | |||
| case CPU_CORTEXX1: | |||
| case CPU_CORTEXX2: | |||
| printf("#define ARMV9\n"); | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| @@ -469,9 +517,9 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef __APPLE__ | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| @@ -480,10 +528,10 @@ void get_cpuconfig(void) | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| #endif | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #endif | |||
| case CPU_A64FX: | |||
| printf("#define A64FX\n"); | |||
| printf("#define L1_CODE_SIZE 65535\n"); | |||
| @@ -494,6 +542,16 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FT2000: | |||
| printf("#define FT2000\n"); | |||
| printf("#define L1_CODE_SIZE 32768\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 33554432\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -33,30 +33,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <stdint.h> | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_LOONGSON3R5 1 | |||
| /* If LASX extension instructions supported, | |||
| * using core LOONGSON3R5 | |||
| * If only LSX extension instructions supported, | |||
| * using core LOONGSON2K1000 | |||
| * If neither LASX nor LSX extension instructions supported, | |||
| * using core LOONGSONGENERIC (As far as I know, there is no such | |||
| * CPU yet) | |||
| */ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_LOONGSON3R5 1 | |||
| #define CPU_LOONGSON2K1000 2 | |||
| #define LOONGARCH_CFG2 0x02 | |||
| #define LOONGARCH_LASX 1<<7 | |||
| #define LOONGARCH_LSX 1<<6 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "LOONGSON3R5" | |||
| "LOONGSONGENERIC", | |||
| "LOONGSON3R5", | |||
| "LOONGSON2K1000" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "loongsongeneric", | |||
| "loongson3r5", | |||
| "loongson2k1000" | |||
| }; | |||
| int detect(void) { | |||
| uint32_t reg = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (reg & LOONGARCH_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else | |||
| return CPU_UNKNOWN; | |||
| #ifdef __linux | |||
| uint32_t reg = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (reg & LOONGARCH_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else if (reg & LOONGARCH_LSX) | |||
| return CPU_LOONGSON2K1000; | |||
| else | |||
| return CPU_GENERIC; | |||
| #endif | |||
| return CPU_GENERIC; | |||
| } | |||
| char *get_corename(void) { | |||
| @@ -68,11 +91,8 @@ void get_architecture(void) { | |||
| } | |||
| void get_subarchitecture(void) { | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("LOONGSON3R5"); | |||
| } else { | |||
| printf("UNKNOWN"); | |||
| } | |||
| int d = detect(); | |||
| printf("%s", cpuname[d]); | |||
| } | |||
| void get_subdirname(void) { | |||
| @@ -80,31 +100,44 @@ void get_subdirname(void) { | |||
| } | |||
| void get_cpuconfig(void) { | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| } else { | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| int d = detect(); | |||
| switch (d) { | |||
| case CPU_LOONGSON3R5: | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_LOONGSON2K1000: | |||
| printf("#define LOONGSON2K1000\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| default: | |||
| printf("#define LOONGSONGENERIC\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| } | |||
| } | |||
| void get_libname(void){ | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("loongson3r5\n"); | |||
| } else { | |||
| printf("loongarch64\n"); | |||
| } | |||
| int d = detect(); | |||
| printf("%s", cpuname_lower[d]); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2014, The OpenBLAS Project | |||
| Copyright (c) 2011-2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -13,9 +13,9 @@ met: | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| @@ -70,16 +70,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
#define CPU_GENERIC 0
#define CPU_C910V   1

/* Names printed for each CPU_* id; the array index must equal the id. */
static char *cpuname[] = {
  "RISCV64_GENERIC",
  "C910V"
};

/*
 * Identify the host core.
 *
 * On Linux, /proc/cpuinfo is scanned for its "model name" and "isa" lines.
 * CPU_C910V is returned only when the model name contains "T-HEAD C910"
 * and the isa string, after its first '4', contains a 'v'.
 * NOTE(review): the '4' is presumably the end of an "rv64" prefix and 'v'
 * the vector extension letter - confirm against real cpuinfo output.
 *
 * Every failure (file cannot be opened, a line is missing or malformed)
 * and every non-Linux build yields CPU_GENERIC.
 */
int detect(void){
#ifdef __linux
  FILE *infile;
  char buffer[512], isa_buffer[512], model_buffer[512];
  const char *check_c910_str = "T-HEAD C910";
  char *pmodel = NULL, *pisa = NULL;
  char *sep;

  infile = fopen("/proc/cpuinfo", "r");
  if (!infile)
    return CPU_GENERIC;

  while (fgets(buffer, sizeof(buffer), infile)){
    if (!strncmp(buffer, "model name", 10)){
      /* Keep a private copy: buffer is overwritten by the next fgets(). */
      strcpy(model_buffer, buffer);
      sep = strchr(model_buffer, ':');
      pmodel = sep ? sep + 1 : NULL;
    }
    if (!strncmp(buffer, "isa", 3)){
      strcpy(isa_buffer, buffer);
      sep = strchr(isa_buffer, '4');
      pisa = sep ? sep + 1 : NULL;
    }
  }
  fclose(infile);

  if (!pmodel || !pisa)
    return CPU_GENERIC;
  if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
    return CPU_C910V;
  return CPU_GENERIC;
#else
  return CPU_GENERIC;
#endif
}
| char *get_corename(void){ | |||
| @@ -91,6 +121,7 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| printf("%s",cpuname[detect()]); | |||
| } | |||
| void get_subdirname(void){ | |||
| @@ -98,7 +129,7 @@ void get_subdirname(void){ | |||
| } | |||
| void get_cpuconfig(void){ | |||
| printf("#define UNKNOWN\n"); | |||
| printf("#define %s\n", cpuname[detect()]); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| @@ -1707,8 +1707,18 @@ int get_cpuname(void){ | |||
| if (model == 0xf && stepping < 0xe) | |||
| return CPUTYPE_NANO; | |||
| return CPUTYPE_NEHALEM; | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return CPUTYPE_ZEN; | |||
| else | |||
| return CPUTYPE_DUNNINGTON; | |||
| default: | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| default: | |||
| if (family >= 0x7) | |||
| if (family >= 0x8) | |||
| return CPUTYPE_NEHALEM; | |||
| else | |||
| return CPUTYPE_VIAC3; | |||
| @@ -1716,7 +1726,20 @@ int get_cpuname(void){ | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN){ | |||
| return CPUTYPE_NEHALEM; | |||
| switch (family) { | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return CPUTYPE_ZEN; | |||
| else | |||
| return CPUTYPE_DUNNINGTON; | |||
| default: | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| default: | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| } | |||
| if (vendor == VENDOR_RISE){ | |||
| @@ -2416,8 +2439,18 @@ int get_coretype(void){ | |||
| if (model == 0xf && stepping < 0xe) | |||
| return CORE_NANO; | |||
| return CORE_NEHALEM; | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return CORE_ZEN; | |||
| else | |||
| return CORE_DUNNINGTON; | |||
| default: | |||
| return CORE_NEHALEM; | |||
| } | |||
| default: | |||
| if (family >= 0x7) | |||
| if (family >= 0x8) | |||
| return CORE_NEHALEM; | |||
| else | |||
| return CORE_VIAC3; | |||
| @@ -2425,7 +2458,20 @@ int get_coretype(void){ | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN) { | |||
| return CORE_NEHALEM; | |||
| switch (family) { | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return CORE_ZEN; | |||
| else | |||
| return CORE_DUNNINGTON; | |||
| default: | |||
| return CORE_NEHALEM; | |||
| } | |||
| default: | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| return CORE_UNKNOWN; | |||
| @@ -44,6 +44,10 @@ COMPILER_DEC | |||
| COMPILER_GNU | |||
| #endif | |||
| #if defined(__fcc_version__) || defined(__FCC_version__) | |||
| COMPILER_FUJITSU | |||
| #endif | |||
| #if defined(__ANDROID__) | |||
| OS_ANDROID | |||
| #endif | |||
| @@ -1,7 +1,9 @@ | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| include_directories(${PROJECT_BINARY_DIR}) | |||
| if (NOT NOFORTRAN) | |||
| enable_language(Fortran) | |||
| endif() | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
| if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) | |||
| @@ -28,14 +30,24 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| continue() | |||
| endif() | |||
| #level1 | |||
| if (NOT NOFORTRAN) | |||
| add_executable(x${float_char}cblat1 | |||
| c_${float_char}blat1.f | |||
| c_${float_char}blas1.c) | |||
| else() | |||
| add_executable(x${float_char}cblat1 | |||
| c_${float_char}blat1c.c | |||
| c_${float_char}blas1.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| target_link_libraries(x${float_char}cblat1 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat1" | |||
| COMMAND $<TARGET_FILE:x${float_char}cblat1>) | |||
| #level2 | |||
| if (NOT NOFORTRAN) | |||
| add_executable(x${float_char}cblat2 | |||
| c_${float_char}blat2.f | |||
| c_${float_char}blas2.c | |||
| @@ -43,11 +55,24 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| else() | |||
| add_executable(x${float_char}cblat2 | |||
| c_${float_char}blat2c.c | |||
| c_${float_char}blas2.c | |||
| c_${float_char}2chke.c | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| target_link_libraries(x${float_char}cblat2 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat2" | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||
| #level3 | |||
| if (NOT NOFORTRAN) | |||
| add_executable(x${float_char}cblat3 | |||
| c_${float_char}blat3.f | |||
| c_${float_char}blas3.c | |||
| @@ -55,7 +80,19 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| else() | |||
| add_executable(x${float_char}cblat3 | |||
| c_${float_char}blat3c.c | |||
| c_${float_char}blas3.c | |||
| c_${float_char}3chke.c | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| target_link_libraries(x${float_char}cblat3 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat3" | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| @@ -43,11 +43,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o | |||
| ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o | |||
| ifeq ($(NOFORTRAN),1) | |||
| all :: | |||
| else | |||
| all :: all1 all2 all3 | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| all1targets += xscblat1 | |||
| @@ -222,53 +218,83 @@ endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| # Single real | |||
| ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN))) | |||
| xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| else | |||
| xscblat1: $(stestl1o) c_sblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xscblat1 c_sblat1c.o $(stestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xscblat2: $(stestl2o) c_sblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xscblat2 c_sblat2c.o $(stestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xscblat3: $(stestl3o) c_sblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xscblat3 c_sblat3c.o $(stestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_DOUBLE),1) | |||
| # Double real | |||
| ifeq ($(NOFORTRAN),0) | |||
| xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| else | |||
| xdcblat1: $(dtestl1o) c_dblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xdcblat1 c_dblat1c.o $(dtestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xdcblat2: $(dtestl2o) c_dblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xdcblat2 c_dblat2c.o $(dtestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xdcblat3: $(dtestl3o) c_dblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xdcblat3 c_dblat3c.o $(dtestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX),1) | |||
| # Single complex | |||
| ifeq ($(NOFORTRAN),0) | |||
| xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| else | |||
| xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX16),1) | |||
| # Double complex | |||
| ifeq ($(NOFORTRAN),0) | |||
| xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| else | |||
| xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -27,11 +27,15 @@ else | |||
| ifeq ($(ARCH),mips64) | |||
| COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),loongarch64) | |||
| COMMONOBJS += dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -99,11 +103,15 @@ else | |||
| ifeq ($(ARCH),mips64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),loongarch64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -352,6 +352,20 @@ int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) | |||
| return pthread_setaffinity_np(thread, cpusetsize, cpu_set); | |||
| } | |||
| int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
| const int active_threads = openblas_get_num_threads(); | |||
| if (thread_idx < 0 || thread_idx >= active_threads) { | |||
| errno = EINVAL; | |||
| return -1; | |||
| } | |||
| pthread_t thread = (thread_idx == active_threads - 1) | |||
| ? pthread_self() | |||
| : blas_threads[thread_idx]; | |||
| return pthread_getaffinity_np(thread, cpusetsize, cpu_set); | |||
| } | |||
| #endif | |||
| static void* blas_thread_server(void *arg){ | |||
| @@ -403,6 +403,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| break; | |||
| } | |||
| if (openblas_omp_adaptive_env() != 0) { | |||
| #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| @@ -412,6 +413,17 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| exec_threads(&queue[i], buf_index); | |||
| } | |||
| } else { | |||
| #pragma omp parallel for schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| queue[i].position = i; | |||
| #endif | |||
| exec_threads(&queue[i], buf_index); | |||
| } | |||
| } | |||
| #ifdef HAVE_C11 | |||
| atomic_store(&blas_buffer_inuse[buf_index], false); | |||
| @@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA; | |||
| #endif | |||
| #ifdef DYN_ATOM | |||
| extern gotoblas_t gotoblas_ATOM; | |||
| elif defined(DYN_NEHALEM) | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_ATOM gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_ATOM gotoblas_PRESCOTT | |||
| @@ -855,7 +855,11 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } else if (exfamily == 10) { | |||
| } else if (exfamily == 10) { | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| @@ -863,7 +867,7 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else { | |||
| return &gotoblas_BARCELONA; | |||
| return NULL; | |||
| } | |||
| } | |||
| @@ -875,14 +879,37 @@ static gotoblas_t *get_coretype(void){ | |||
| if (model == 0xf && stepping < 0xe) | |||
| return &gotoblas_NANO; | |||
| return &gotoblas_NEHALEM; | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return &gotoblas_ZEN; | |||
| else | |||
| return &gotoblas_DUNNINGTON; | |||
| default: | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| default: | |||
| if (family >= 0x7) | |||
| if (family >= 0x8) | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN) { | |||
| return &gotoblas_NEHALEM; | |||
| switch (family) { | |||
| case 0x7: | |||
| switch (exmodel) { | |||
| case 5: | |||
| if (support_avx2()) | |||
| return &gotoblas_ZEN; | |||
| else | |||
| return &gotoblas_DUNNINGTON; | |||
| default: | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| default: | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| @@ -99,6 +99,16 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #else | |||
| #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_NEOVERSEV1 | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_NEOVERSEN2 | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| #else | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| @@ -115,6 +125,8 @@ extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #endif | |||
| @@ -166,8 +178,10 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_TSV110) return corename[ 8]; | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; | |||
| if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -198,8 +212,10 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 8: return (&gotoblas_TSV110); | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_THUNDERX3T110); | |||
| case 12: return (&gotoblas_CORTEXA55); | |||
| case 11: return (&gotoblas_NEOVERSEV1); | |||
| case 12: return (&gotoblas_NEOVERSEN2); | |||
| case 13: return (&gotoblas_THUNDERX3T110); | |||
| case 14: return (&gotoblas_CORTEXA55); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -258,6 +274,10 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_CORTEXA73; | |||
| case 0xd0c: // Neoverse N1 | |||
| return &gotoblas_NEOVERSEN1; | |||
| case 0xd49: | |||
| return &gotoblas_NEOVERSEN2; | |||
| case 0xd40: | |||
| return &gotoblas_NEOVERSEV1; | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| } | |||
| @@ -0,0 +1,128 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| extern gotoblas_t gotoblas_LOONGSON3R5; | |||
| extern gotoblas_t gotoblas_LOONGSON2K1000; | |||
| extern gotoblas_t gotoblas_LOONGSONGENERIC; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 3 | |||
| static char *corename[] = { | |||
| "loongson3r5", | |||
| "loongson2k1000", | |||
| "loongsongeneric", | |||
| "unknown" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; | |||
| if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; | |||
| if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| static gotoblas_t *force_coretype(char *coretype) { | |||
| int i; | |||
| int found = -1; | |||
| char message[128]; | |||
| for ( i=0 ; i < NUM_CORETYPES; i++) | |||
| { | |||
| if (!strncasecmp(coretype, corename[i], 20)) | |||
| { | |||
| found = i; | |||
| break; | |||
| } | |||
| } | |||
| switch (found) | |||
| { | |||
| case 0: return (&gotoblas_LOONGSON3R5); | |||
| case 1: return (&gotoblas_LOONGSON2K1000); | |||
| case 2: return (&gotoblas_LOONGSONGENERIC); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| #define LASX_MASK 1<<7 | |||
| #define LSX_MASK 1<<6 | |||
| #define LOONGARCH_CFG2 0x02 | |||
| static gotoblas_t *get_coretype(void) { | |||
| int ret = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(ret) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (ret & LASX_MASK) | |||
| return &gotoblas_LOONGSON3R5; | |||
| else if (ret & LSX_MASK) | |||
| return &gotoblas_LOONGSON2K1000; | |||
| else | |||
| return &gotoblas_LOONGSONGENERIC; | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char coren[22]; | |||
| char *p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if ( p ) | |||
| { | |||
| gotoblas = force_coretype(p); | |||
| } | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| } | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -877,21 +877,21 @@ void gotoblas_affinity_init(void) { | |||
| nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(__GLIBC_PREREQ) | |||
| common->num_procs = nums; | |||
| common->num_procs = nums >0 ? nums : 2; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| common->num_procs = nums; | |||
| common->num_procs = nums >0 ? nums : 2; | |||
| #elif __GLIBC_PREREQ(2, 7) | |||
| cpusetp = CPU_ALLOC(nums); | |||
| cpusetp = CPU_ALLOC(nums>0? nums:1024); | |||
| if (cpusetp == NULL) { | |||
| common->num_procs = nums; | |||
| common->num_procs = nums>0 ? nums: 2; | |||
| } else { | |||
| size_t size; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| size = CPU_ALLOC_SIZE(nums>0? nums: 1024); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) | |||
| common->num_procs = nums; | |||
| common->num_procs = nums >0 ? nums : 1; | |||
| else | |||
| common->num_procs = CPU_COUNT_S(size,cpusetp); | |||
| } | |||
| @@ -899,12 +899,12 @@ void gotoblas_affinity_init(void) { | |||
| #else | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); | |||
| if (ret!=0) { | |||
| common->num_procs = nums; | |||
| common->num_procs = nums >0 ? nums : 2; | |||
| } else { | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int i; | |||
| int n = 0; | |||
| for (i=0;i<nums;i++) | |||
| for (i=0;i<(nums >0 ?nums:1024) ;i++) | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| common->num_procs = n; | |||
| } | |||
| @@ -1022,7 +1022,7 @@ void gotoblas_set_affinity2(int threads) {}; | |||
| void gotoblas_affinity_reschedule(void) {}; | |||
| int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); } | |||
/* Number of configured processors as reported by sysconf(); falls back to 2
 * when the query fails or reports a non-positive count. */
int get_num_procs(void) {
  int nums = (int)sysconf(_SC_NPROCESSORS_CONF);
  return (nums > 0 ? nums : 2);
}
| int get_num_nodes(void) { return 1; } | |||
| @@ -252,23 +252,23 @@ int get_num_procs(void) { | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| return (nums > 0 ? nums : 2); | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return (nums > 0 ? nums : 2); | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| if (ret!=0) return (nums > 0 ? nums :2); | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| @@ -277,31 +277,31 @@ int get_num_procs(void) { | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #else | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -1823,56 +1823,56 @@ int get_num_procs(void) { | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| if (ret!=0) return (nums > 0 ? nums :2); | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| for (i=0;i<(nums > 0 ? nums :2);i++) | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| #else | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| return (nums > 0 ? nums :2); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -39,6 +39,7 @@ static int openblas_env_block_factor=0; | |||
| static int openblas_env_openblas_num_threads=0; | |||
| static int openblas_env_goto_num_threads=0; | |||
| static int openblas_env_omp_num_threads=0; | |||
| static int openblas_env_omp_adaptive=0; | |||
| int openblas_verbose() { return openblas_env_verbose;} | |||
| unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} | |||
| @@ -46,6 +47,7 @@ int openblas_block_factor() { return openblas_env_block_factor;} | |||
| int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} | |||
| int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} | |||
| int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} | |||
| int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;} | |||
| void openblas_read_env() { | |||
| int ret=0; | |||
| @@ -79,6 +81,11 @@ void openblas_read_env() { | |||
| if(ret<0) ret=0; | |||
| openblas_env_omp_num_threads=ret; | |||
| ret=0; | |||
| if (readenv(p,"OMP_ADAPTIVE")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_omp_adaptive=ret; | |||
| } | |||
| @@ -60,6 +60,9 @@ static char* openblas_config_str="" | |||
| #ifdef USE_OPENMP | |||
| "USE_OPENMP " | |||
| #endif | |||
| #ifdef USE_TLS | |||
| "USE_TLS " | |||
| #endif | |||
| #ifndef DYNAMIC_ARCH | |||
| CHAR_CORENAME | |||
| #endif | |||
| @@ -2,6 +2,12 @@ TOPDIR = .. | |||
| include ../Makefile.system | |||
| ifdef USE_PERL | |||
| GENSYM = gensymbol.pl | |||
| else | |||
| GENSYM = gensymbol | |||
| endif | |||
| ifndef EXPRECISION | |||
| EXPRECISION = 0 | |||
| endif | |||
| @@ -119,11 +125,11 @@ dll : ../$(LIBDLLNAME) | |||
| -shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(IMPLIBNAME) \ | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | |||
| $(LIBPREFIX).def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| $(LIBPREFIX).def : $(GENSYM) | |||
| ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| libgoto_hpl.def : gensymbol | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| libgoto_hpl.def : $(GENSYM) | |||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ifeq ($(OSNAME), Darwin) | |||
| INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| @@ -265,24 +271,24 @@ static : ../$(LIBNAME) | |||
| $(AR) -cq ../$(LIBNAME) goto.$(SUFFIX) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| osx.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| aix.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objconv.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| linktest.c : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -1,6 +1,16 @@ | |||
| #!/usr/bin/env perl | |||
| #!/bin/sh | |||
# split STRING SEPARATORS
#
# Print the fields of STRING, cut at every character listed in SEPARATORS,
# each field followed by a single space. Pathname expansion is switched off
# while the unquoted expansion is performed so that fields containing glob
# characters are passed through verbatim. IFS is put back to its previous
# value; pathname expansion is switched back on before returning.
split() {
    old_ifs=$IFS
    set -f
    IFS=$2
    # Unquoted on purpose: field splitting on the separator characters.
    set -- $1
    IFS=$old_ifs
    printf '%s ' "$@"
    set +f
}
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| hostos="$(uname -s | sed 's/\-.*//')" | |||
| # | |||
| # 1. Not specified | |||
| @@ -12,407 +22,397 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| # 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition | |||
| # | |||
| $makefile = shift(@ARGV); | |||
| $config = shift(@ARGV); | |||
| makefile="$1" | |||
| config="$2" | |||
| nofortran=0 | |||
| $nofortran = 0; | |||
| shift 2 | |||
| compiler="$*" | |||
| compiler_bin="$1" | |||
| $compiler = join(" ", @ARGV); | |||
| $compiler_bin = shift(@ARGV); | |||
| # f77 is too ambiguous | |||
| $compiler = "" if $compiler eq "f77"; | |||
| @path = split(/:/, $ENV{"PATH"}); | |||
| if ($compiler eq "") { | |||
| @lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95", | |||
| "sunf77", "sunf90", "sunf95", | |||
| "xlf95", "xlf90", "xlf", | |||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | |||
| "pathf90", "pathf95", | |||
| "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", | |||
| "flang", "egfortran", | |||
| "ifort", "nagfor"); | |||
| OUTER: | |||
| foreach $lists (@lists) { | |||
| foreach $path (@path) { | |||
| if (-x $path . "/" . $lists) { | |||
| $compiler = $lists; | |||
| $compiler_bin = $lists; | |||
| last OUTER; | |||
| [ "$compiler" = "f77" ] && compiler='' | |||
| path=`split "$PATH" ':'` | |||
| if [ -z "$compiler" ]; then | |||
| lists="gfortran g95 frt fort openf90 openf95 | |||
| sunf77 sunf90 sunf95 | |||
| xlf95 xlf90 xlf | |||
| ppuf77 ppuf95 ppuf90 ppuxlf | |||
| pathf90 pathf95 | |||
| pgf95 pgf90 pgf77 pgfortran nvfortran | |||
| flang egfortran | |||
| ifort nagfor ifx ftn crayftn" | |||
| for list in $lists; do | |||
| for p in $path; do | |||
| if [ -x "$p/$list" ]; then | |||
| compiler=$list | |||
| compiler_bin=$list | |||
| break 2 | |||
| fi | |||
| done | |||
| done | |||
| fi | |||
| if [ -z "$compiler" ]; then | |||
| nofortran=1 | |||
| compiler=gfortran | |||
| vendor=GFORTRAN | |||
| bu="_" | |||
| else | |||
| { | |||
| data="$(command -v "$compiler_bin" >/dev/null 2>&1)" | |||
| vendor="" | |||
| } && { | |||
| data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s` | |||
| if [ -z "$data" ]; then | |||
| data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c` | |||
| fi | |||
| case "$data" in *zhoge_*) bu=_ ;; esac | |||
| case "$data" in | |||
| *Fujitsu*) | |||
| vendor=FUJITSU | |||
| openmp='-Kopenmp' | |||
| ;; | |||
| *Cray*) | |||
| vendor=CRAY | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *GNU*|*GCC*) | |||
| v="${data#*GCC: *\) }" | |||
| v="${v%%\"*}" | |||
| major="${v%%.*}" | |||
| if [ "$major" -ge 4 ]; then | |||
| vendor=GFORTRAN | |||
| openmp='-fopenmp' | |||
| else | |||
| case "$compiler" in | |||
| *flang*) | |||
| vendor=FLANG | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *ifx*) | |||
| vendor=INTEL | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *pgf*|*nvf*) | |||
| vendor=PGI | |||
| openmp='-mp' | |||
| ;; | |||
| *) | |||
| vendor=G77 | |||
| openmp='' | |||
| ;; | |||
| esac | |||
| fi | |||
| ;; | |||
| *g95*) | |||
| vendor=G95 | |||
| openmp='' | |||
| ;; | |||
| *Intel*) | |||
| vendor=INTEL | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *'Sun Fortran'*) | |||
| vendor=SUN | |||
| openmp='-xopenmp=parallel' | |||
| ;; | |||
| *PathScale*) | |||
| vendor=PATHSCALE | |||
| openmp='-openmp' | |||
| ;; | |||
| *Open64*) | |||
| vendor=OPEN64 | |||
| openmp='-mp' | |||
| ;; | |||
| *PGF*|*NVF*) | |||
| vendor=PGI | |||
| openmp='-mp' | |||
| ;; | |||
| *'IBM XL'*) | |||
| vendor=IBM | |||
| openmp='-openmp' | |||
| ;; | |||
| *NAG*) | |||
| vendor=NAG | |||
| openmp='-openmp' | |||
| ;; | |||
| esac | |||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s` | |||
| [ -z "$data" ] && { | |||
| data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c` | |||
| } | |||
| case "$data" in *' zho_ge__'*) need2bu=1 ;; esac | |||
| case "$vendor" in *G95*) [ "$NO_LAPACKE" != 1 ] && need2bu='' ;; esac | |||
| } | |||
| if [ -z "$vendor" ]; then | |||
| case "$compiler" in | |||
| *g77*) | |||
| vendor=G77 | |||
| bu=_ | |||
| openmp='' | |||
| ;; | |||
| *g95*) | |||
| vendor=G95 | |||
| bu=_ | |||
| openmp='' | |||
| ;; | |||
| *gfortran*) | |||
| vendor=GFORTRAN | |||
| bu=_ | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *ifort*|*ifx*) | |||
| vendor=INTEL | |||
| bu=_ | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *pathf*) | |||
| vendor=PATHSCALE | |||
| bu=_ | |||
| openmp='-mp' | |||
| ;; | |||
| *pgf*|*nvf*) | |||
| vendor=PGI | |||
| bu=_ | |||
| openmp='-mp' | |||
| ;; | |||
| *ftn*) | |||
| vendor=PGI | |||
| bu=_ | |||
| openmp=-openmp | |||
| ;; | |||
| *frt*) | |||
| vendor=FUJITSU | |||
| bu=_ | |||
| openmp='-openmp' | |||
| ;; | |||
| *sunf77*|*sunf90*|*sunf95*) | |||
| vendor=SUN | |||
| bu=_ | |||
| openmp='-xopenmp=parallel' | |||
| ;; | |||
| *ppuf*|*xlf*) | |||
| vendor=IBM | |||
| openmp='-openmp' | |||
| ;; | |||
| *open64*) | |||
| vendor=OPEN64 | |||
| openmp='-mp' | |||
| ;; | |||
| *flang*) | |||
| vendor=FLANG | |||
| bu=_ | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *nagfor*) | |||
| vendor=NAG | |||
| bu=_ | |||
| openmp='-openmp' | |||
| ;; | |||
| esac | |||
| if [ -z "$vendor" ]; then | |||
| nofortran=1 | |||
| compiler="gfortran" | |||
| vendor=GFORTRAN | |||
| bu=_ | |||
| openmp='' | |||
| fi | |||
| fi | |||
| fi | |||
| { | |||
| data=`command -v $compiler_bin >/dev/null 2>&1` | |||
| } && { | |||
| binary=$BINARY | |||
| [ "$USE_OPENMP" != 1 ] && openmp='' | |||
| case "$binary" in | |||
| 32) | |||
| { | |||
| link=`$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| link=`$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| # for AIX | |||
| link=`$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| # for gfortran MIPS | |||
| mips_data=`$compiler_bin -E -dM - < /dev/null` | |||
| case "$mips_data" in | |||
| *_MIPS_ISA_MIPS64*) | |||
| link=`$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| ;; | |||
| *) | |||
| link=`$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| ;; | |||
| esac | |||
| } || { | |||
| binary='' | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if ($compiler eq "") { | |||
| $nofortran = 1; | |||
| $compiler = "gfortran"; | |||
| $vendor = GFORTRAN; | |||
| $bu = "_"; | |||
| } else { | |||
| $data = `which $compiler_bin > /dev/null 2> /dev/null`; | |||
| $vendor = ""; | |||
| if (!$?) { | |||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; | |||
| if ($data eq "") { | |||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; | |||
| } | |||
| if ($data =~ /zhoge_/) { | |||
| $bu = "_"; | |||
| } | |||
| if ($data =~ /Fujitsu/) { | |||
| $vendor = FUJITSU; | |||
| $openmp = "-Kopenmp"; | |||
| } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { | |||
| $data =~ s/\(+.*?\)+//g; | |||
| $data =~ /(\d+)\.(\d+).(\d+)/; | |||
| $major = $1; | |||
| $minor = $2; | |||
| if ($major >= 4) { | |||
| $vendor = GFORTRAN; | |||
| $openmp = "-fopenmp"; | |||
| } else { | |||
| if ($compiler =~ /flang/) { | |||
| $vendor = FLANG; | |||
| $openmp = "-fopenmp"; | |||
| } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } else { | |||
| $vendor = G77; | |||
| $openmp = ""; | |||
| } | |||
| } | |||
| } | |||
| if ($data =~ /g95/) { | |||
| $vendor = G95; | |||
| $openmp = ""; | |||
| } | |||
| if ($data =~ /Intel/) { | |||
| $vendor = INTEL; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($data =~ /Sun Fortran/) { | |||
| $vendor = SUN; | |||
| $openmp = "-xopenmp=parallel"; | |||
| } | |||
| if ($data =~ /PathScale/) { | |||
| $vendor = PATHSCALE; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($data =~ /Open64/) { | |||
| $vendor = OPEN64; | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($data =~ /PGF/ || $data =~ /NVF/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($data =~ /IBM XL/) { | |||
| $vendor = IBM; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($data =~ /NAG/) { | |||
| $vendor = NAG; | |||
| $openmp = "-openmp"; | |||
| } | |||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | |||
| if ($data eq "") { | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; | |||
| } | |||
| if ($data =~ / zho_ge__/) { | |||
| $need2bu = 1; | |||
| } | |||
| if ($vendor =~ /G95/) { | |||
| if ($ENV{NO_LAPACKE} != 1) { | |||
| $need2bu = ""; | |||
| } | |||
| } | |||
| } | |||
| if ($vendor eq "") { | |||
| if ($compiler =~ /g77/) { | |||
| $vendor = G77; | |||
| $bu = "_"; | |||
| $openmp = ""; | |||
| } | |||
| if ($compiler =~ /g95/) { | |||
| $vendor = G95; | |||
| $bu = "_"; | |||
| $openmp = ""; | |||
| } | |||
| if ($compiler =~ /gfortran/) { | |||
| $vendor = GFORTRAN; | |||
| $bu = "_"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /ifort/) { | |||
| $vendor = INTEL; | |||
| $bu = "_"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /pathf/) { | |||
| $vendor = PATHSCALE; | |||
| $bu = "_"; | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $bu = "_"; | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler =~ /ftn/) { | |||
| $vendor = PGI; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($compiler =~ /frt/) { | |||
| $vendor = FUJITSU; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($compiler =~ /sunf77|sunf90|sunf95/) { | |||
| $vendor = SUN; | |||
| $bu = "_"; | |||
| $openmp = "-xopenmp=parallel"; | |||
| } | |||
| if ($compiler =~ /ppuf/) { | |||
| $vendor = IBM; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($compiler =~ /xlf/) { | |||
| $vendor = IBM; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($compiler =~ /open64/) { | |||
| $vendor = OPEN64; | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler =~ /flang/) { | |||
| $vendor = FLANG; | |||
| $bu = "_"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /nagfor/) { | |||
| $vendor = NAG; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($vendor eq "") { | |||
| $nofortran = 1; | |||
| $compiler = "gfortran"; | |||
| $vendor = GFORTRAN; | |||
| $bu = "_"; | |||
| $openmp = ""; | |||
| } | |||
| ;; | |||
| 64) | |||
| { | |||
| link=`$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| link=`$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| # for AIX | |||
| link=`$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| # for gfortran MIPS | |||
| link=`$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| # for nagfor | |||
| link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| } || { | |||
| binary='' | |||
| } | |||
| ;; | |||
| esac | |||
| } | |||
| if [ -z "$binary" ]; then | |||
| link=`$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| fi | |||
| } | |||
| $data = `which $compiler_bin > /dev/null 2> /dev/null`; | |||
| if [ "$vendor" = "NAG" ]; then | |||
| link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| fi | |||
| if [ "$vendor" = "CRAY" ]; then | |||
| link=`$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe` | |||
| fi | |||
| linker_L="" | |||
| linker_l="" | |||
| linker_a="" | |||
| if (!$?) { | |||
| if [ -n "$link" ]; then | |||
| $binary = $ENV{"BINARY"}; | |||
| link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'` | |||
| $openmp = "" if $ENV{USE_OPENMP} != 1; | |||
| link=`echo "$link" | sed 's/\-R[[:space:]]*/\-rpath\%/g'` | |||
| if ($binary == 32) { | |||
| $link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| if ($?) { | |||
| $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| # for AIX | |||
| if ($?) { | |||
| $link = `$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For gfortran MIPS | |||
| if ($?) { | |||
| $mips_data = `$compiler_bin -E -dM - < /dev/null`; | |||
| if ($mips_data =~ /_MIPS_ISA_MIPS64/) { | |||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } else { | |||
| $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
# Glue "-rpath <dir>" into the single token "-rpath%<dir>" so the option and
# its argument stay together when the link line is split into flags.
# POSIX basic regular expressions have no "+" operator (a bare "+" matches a
# literal plus sign), so "one or more whitespace" is spelled out explicitly.
link=`echo "$link" | sed 's/\-rpath[[:space:]][[:space:]]*/\-rpath\%/g'`
| if ($binary == 64) { | |||
| $link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| if ($?) { | |||
| $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| # for AIX | |||
| if ($?) { | |||
| $link = `$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For gfortran MIPS | |||
| if ($?) { | |||
| $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For nagfor | |||
| if ($?) { | |||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
| if ($binary eq "") { | |||
| $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| } | |||
# Glue "-rpath-link <dir>" into the single token "-rpath-link%<dir>" so the
# option and its argument stay together when the link line is split into flags.
# POSIX basic regular expressions have no "+" operator (a bare "+" matches a
# literal plus sign), so "one or more whitespace" is spelled out explicitly.
link=`echo "$link" | sed 's/\-rpath-link[[:space:]][[:space:]]*/\-rpath-link\%/g'`
| if ( $vendor eq "NAG") { | |||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $linker_L = ""; | |||
| $linker_l = ""; | |||
| $linker_a = ""; | |||
| if ($link ne "") { | |||
| $link =~ s/\-Y\sP\,/\-Y/g; | |||
| $link =~ s/\-R\s*/\-rpath\%/g; | |||
| $link =~ s/\-rpath\s+/\-rpath\%/g; | |||
| $link =~ s/\-rpath-link\s+/\-rpath-link\%/g; | |||
| @flags = split(/[\s\,\n]/, $link); | |||
| flags=`echo "$link" | tr "',\n" " "` | |||
| # remove leading and trailing quotes from each flag. | |||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||
| foreach $flags (@flags) { | |||
| if ( | |||
| ($flags =~ /^\-L/) | |||
| && ($flags !~ /^-LIST:/) | |||
| && ($flags !~ /^-LANG:/) | |||
| ) { | |||
| $linker_L .= $flags . " "; | |||
| } | |||
| if ($flags =~ /^\-Y/) { | |||
| next if ($hostos eq 'SunOS'); | |||
| $linker_L .= "-Wl,". $flags . " "; | |||
| } | |||
| if ($flags =~ /^\--exclude-libs/) { | |||
| $linker_L .= "-Wl,". $flags . " "; | |||
| $flags=""; | |||
| } | |||
| if ($flags =~ /^\-rpath\%/) { | |||
| $flags =~ s/\%/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /^\-rpath-link\%/) { | |||
| $flags =~ s/\%/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { | |||
| $flags = "-lomp"; | |||
| } | |||
| if ( | |||
| ($flags =~ /^\-l/) | |||
| && ($flags !~ /ibrary/) | |||
| && ($flags !~ /gfortranbegin/) | |||
| && ($flags !~ /flangmain/) | |||
| && ($flags !~ /frtbegin/) | |||
| && ($flags !~ /pathfstart/) | |||
| && ($flags !~ /crt[0-9]/) | |||
| && ($flags !~ /gcc/) | |||
| && ($flags !~ /user32/) | |||
| && ($flags !~ /kernel32/) | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/)) | |||
| && ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/)) | |||
| && ($flags !~ /^\-l$/) | |||
| ) { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| if ( $flags =~ /quickfit.o/ && $vendor == NAG) { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| if ( $flags =~ /safefit.o/ && $vendor == NAG) { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| if ( $flags =~ /thsafe.o/ && $vendor == NAG) { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| $linker_a .= $flags . " " if $flags =~ /\.a$/; | |||
| } | |||
| } | |||
| if ($vendor eq "FLANG"){ | |||
| $linker_a .= "-lflang" | |||
| } | |||
| open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; | |||
| open(CONFFILE, ">> $config" ) || die "Can't append $config"; | |||
| print MAKEFILE "F_COMPILER=$vendor\n"; | |||
| print MAKEFILE "FC=$compiler\n"; | |||
| print MAKEFILE "BU=$bu\n" if $bu ne ""; | |||
| print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1; | |||
| print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne ""; | |||
| print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne ""; | |||
| print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne ""; | |||
| print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne ""; | |||
| if (($linker_l ne "") || ($linker_a ne "")) { | |||
| print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| } | |||
| #@flags = map {s/^['"]|['"]$//g; $_} @flags; | |||
| for flag in $flags; do | |||
| case "$flag" in -L*) | |||
| case "$flag" in | |||
| -LIST:*|-LANG:*) ;; | |||
| *) linker_L="$linker_L $flag" ;; | |||
| esac | |||
| esac | |||
| case "$flag" in -Y*) | |||
| [ "$hostos" = "SunOS" ] && continue | |||
| linker_L="$linker_L -Wl,$flag" | |||
| ;; | |||
| esac | |||
| case "$flag" in --exclude-libs*) | |||
| linker_L="$linker_L -Wl,$flag" | |||
| flag="" | |||
| ;; | |||
| esac | |||
| case "$flag" in -rpath%*) | |||
| flag=`echo "$flag" | sed 's/\%/\,/g'` | |||
| linker_L="$linker_L -Wl,$flag" | |||
| esac | |||
| case "$flag" in -rpath-link%*) | |||
| flag=`echo "$flag" | sed 's/\%/\,/g'` | |||
| linker_L="$linker_L -Wl,$flag" | |||
| ;; | |||
| esac | |||
| case "$flag" in *-lgomp*) | |||
| case "$CC" in *clang*) | |||
| flag="-lomp" | |||
| ;; | |||
| esac | |||
| esac | |||
| case "$flag" in -l*) | |||
| case "$flag" in | |||
| *ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\ | |||
| *crt[0-9]*|*gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|\ | |||
| -l) ;; | |||
| *omp*) | |||
| case "$vendor" in | |||
| *PGI*|*FUJITSU*) ;; | |||
| *) linker_l="$linker_l $flag" ;; | |||
| esac | |||
| ;; | |||
| *[0-9]*) | |||
| if [ "$vendor" = "FUJITSU" ]; then | |||
| case "$flag" in | |||
| -lfj90*) linker_l="$linker_l $flag" ;; | |||
| *) ;; | |||
| esac | |||
| fi | |||
| ;; | |||
| *) linker_l="$linker_l $flag" ;; | |||
| esac | |||
| esac | |||
| case "$flag" in *quickfit.o*) | |||
| [ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;; | |||
| esac | |||
| case "$flag" in *safefit.o*) | |||
| [ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;; | |||
| esac | |||
| case "$flag" in *thsafe.o*) | |||
| [ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;; | |||
| esac | |||
| case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac | |||
| done | |||
| fi | |||
| if [ "$vendor" = "FLANG" ]; then | |||
| linker_a="$linker_a -lflang" | |||
| fi | |||
# Append the detection results: make-syntax assignments go to "$makefile",
# preprocessor definitions go to "$config".
printf "F_COMPILER=%s\n" "$vendor" >> "$makefile"
printf "FC=%s\n" "$compiler" >> "$makefile"
[ -n "$bu" ] && printf 'BU=%s\n' "$bu" >> "$makefile"
[ "$nofortran" -eq 1 ] && printf 'NOFORTRAN=1\n' >> "$makefile"

[ -n "$bu" ] && printf '#define BUNDERSCORE\t%s\n' "$bu" >> "$config"
[ -n "$bu" ] && printf '#define NEEDBUNDERSCORE\t1\n' >> "$config"
[ -n "$need2bu" ] && printf "#define NEED2UNDERSCORES\t1\n" >> "$config"
# The make-side flag is a plain assignment in the makefile fragment. Writing
# it as "#define NEED2UNDERSCORES=1" into the header would leave the make
# variable unset and redefine the macro above with the value "=1".
[ -n "$need2bu" ] && printf "NEED2UNDERSCORES=1\n" >> "$makefile"

# Only emit FEXTRALIB when at least one library flag or archive was found.
if [ -n "$linker_l" ] || [ -n "$linker_a" ]; then
    printf "FEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" >> "$makefile"
fi
| close(MAKEFILE); | |||
| close(CONFFILE); | |||
| @@ -0,0 +1,429 @@ | |||
| #!/usr/bin/env perl | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| # | |||
| # 1. Not specified | |||
| # 1.1 Automatically detect, then check compiler | |||
| # 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition | |||
| # 2. Specified | |||
| # 2.1 If path is correct, check compiler | |||
| # 2.2 If path is not correct, but still valid compiler name, force setting | |||
| # 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition | |||
| # | |||
| $makefile = shift(@ARGV); | |||
| $config = shift(@ARGV); | |||
| $nofortran = 0; | |||
| $compiler = join(" ", @ARGV); | |||
| $compiler_bin = shift(@ARGV); | |||
| # f77 is too ambiguous | |||
| $compiler = "" if $compiler eq "f77"; | |||
| @path = split(/:/, $ENV{"PATH"}); | |||
| if ($compiler eq "") { | |||
| @lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95", | |||
| "sunf77", "sunf90", "sunf95", | |||
| "xlf95", "xlf90", "xlf", | |||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | |||
| "pathf90", "pathf95", | |||
| "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", | |||
| "flang", "egfortran", | |||
| "ifort", "nagfor", "ifx", "ftn", "crayftn"); | |||
| OUTER: | |||
| foreach $lists (@lists) { | |||
| foreach $path (@path) { | |||
| if (-x $path . "/" . $lists) { | |||
| $compiler = $lists; | |||
| $compiler_bin = $lists; | |||
| last OUTER; | |||
| } | |||
| } | |||
| } | |||
| } | |||
# Identify the Fortran compiler vendor and its conventions.
# Sets $vendor, $bu (underscore suffix), $need2bu and $openmp; when no usable
# compiler can be identified it sets $nofortran and falls back to gfortran
# defaults.
if ($compiler eq "") {
    # Nothing was found at all.
    $nofortran = 1;
    $compiler  = "gfortran";
    $vendor    = "GFORTRAN";
    $bu        = "_";
} else {
    $data   = `which $compiler_bin > /dev/null 2> /dev/null`;
    $vendor = "";

    if (!$?) {
        # Compile ftest.f with -S and inspect the generated ftest.s; if that
        # produced nothing, try a generated ftest.c instead.
        $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
        if ($data eq "") {
            $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
        }

        if ($data =~ /zhoge_/) {
            $bu = "_";
        }

        if ($data =~ /Fujitsu/) {
            $vendor = "FUJITSU";
            $openmp = "-Kopenmp";
        } elsif ($data =~ /Cray/) {
            $vendor = "CRAY";
            $openmp = "-fopenmp";
        } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
            # Drop parenthesised text before looking for the version number.
            $data =~ s/\(+.*?\)+//g;

            # Read the capture variables only after a successful match; a
            # failed match leaves $1/$2 holding whatever an earlier match set.
            if ($data =~ /(\d+)\.(\d+).(\d+)/) {
                $major = $1;
                $minor = $2;
            } else {
                $major = 0;
                $minor = 0;
            }

            if ($major >= 4) {
                $vendor = "GFORTRAN";
                $openmp = "-fopenmp";
            } else {
                # GNU-looking output with a version below 4: decide by the
                # name of the compiler command instead.
                if ($compiler =~ /flang/) {
                    $vendor = "FLANG";
                    $openmp = "-fopenmp";
                } elsif ($compiler =~ /ifx/) {
                    $vendor = "INTEL";
                    $openmp = "-fopenmp";
                } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
                    $vendor = "PGI";
                    $openmp = "-mp";
                } else {
                    $vendor = "G77";
                    $openmp = "";
                }
            }
        }

        # Vendor signatures in the generated output. Every entry is tested in
        # order and a later match overrides an earlier one.
        my @signatures = (
            [ qr/g95/,         "G95",       ""                  ],
            [ qr/Intel/,       "INTEL",     "-fopenmp"          ],
            [ qr/Sun Fortran/, "SUN",       "-xopenmp=parallel" ],
            [ qr/PathScale/,   "PATHSCALE", "-openmp"           ],
            [ qr/Open64/,      "OPEN64",    "-mp"               ],
            [ qr/PGF|NVF/,     "PGI",       "-mp"               ],
            [ qr/IBM XL/,      "IBM",       "-openmp"           ],
            [ qr/NAG/,         "NAG",       "-openmp"           ],
        );
        foreach my $sig (@signatures) {
            my ($pattern, $name, $flag) = @$sig;
            next unless $data =~ $pattern;
            $vendor = $name;
            $openmp = $flag;
        }

        # for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
        $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
        if ($data eq "") {
            $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
        }
        if ($data =~ / zho_ge__/) {
            $need2bu = 1;
        }
        if ($vendor =~ /G95/) {
            if ($ENV{NO_LAPACKE} != 1) {
                $need2bu = "";
            }
        }
    }

    if ($vendor eq "") {
        # The probe gave no answer: guess from the compiler command name.
        # Entries are applied in order and later matches override earlier
        # ones (e.g. "pgfortran" matches gfortran first, then pgf).
        # An undef underscore column leaves $bu untouched.
        my @by_name = (
            [ qr/g77/,                  "G77",       "_",   ""                  ],
            [ qr/g95/,                  "G95",       "_",   ""                  ],
            [ qr/gfortran/,             "GFORTRAN",  "_",   "-fopenmp"          ],
            [ qr/ifort|ifx/,            "INTEL",     "_",   "-fopenmp"          ],
            [ qr/pathf/,                "PATHSCALE", "_",   "-mp"               ],
            [ qr/pgf|nvf/,              "PGI",       "_",   "-mp"               ],
            [ qr/ftn/,                  "PGI",       "_",   "-openmp"           ],
            [ qr/frt/,                  "FUJITSU",   "_",   "-openmp"           ],
            [ qr/sunf77|sunf90|sunf95/, "SUN",       "_",   "-xopenmp=parallel" ],
            [ qr/ppuf/,                 "IBM",       undef, "-openmp"           ],
            [ qr/xlf/,                  "IBM",       undef, "-openmp"           ],
            [ qr/open64/,               "OPEN64",    undef, "-mp"               ],
            [ qr/flang/,                "FLANG",     "_",   "-fopenmp"          ],
            [ qr/nagfor/,               "NAG",       "_",   "-openmp"           ],
        );
        foreach my $rule (@by_name) {
            my ($pattern, $name, $underscore, $flag) = @$rule;
            next unless $compiler =~ $pattern;
            $vendor = $name;
            $bu     = $underscore if defined $underscore;
            $openmp = $flag;
        }

        if ($vendor eq "") {
            # Unknown compiler name: treat as "no Fortran".
            $nofortran = 1;
            $compiler  = "gfortran";
            $vendor    = "GFORTRAN";
            $bu        = "_";
            $openmp    = "";
        }
    }
}
# Build ftest2.f and capture the compiler's output in $link.
# Options that select the requested word size ($ENV{BINARY}) are tried one
# after another until a command succeeds.
$data = `which $compiler_bin > /dev/null 2> /dev/null`;

unless ($?) {
    $binary = $ENV{"BINARY"};
    if ($ENV{USE_OPENMP} != 1) {
        $openmp = "";
    }

    if ($binary == 32) {
        # -m32, then -q32, then -maix32 (for AIX)
        foreach my $abi_flag ("-m32", "-q32", "-maix32") {
            $link = `$compiler $openmp $abi_flag -v ftest2.f 2>&1 && rm -f a.out a.exe`;
            last unless $?;
        }

        #For gfortran MIPS
        if ($?) {
            $mips_data = `$compiler_bin -E -dM - < /dev/null`;
            my $mips_abi = ($mips_data =~ /_MIPS_ISA_MIPS64/) ? "-mabi=n32" : "-mabi=32";
            $link = `$compiler $openmp $mips_abi -v ftest2.f 2>&1 && rm -f a.out a.exe`;
        }

        # Nothing worked: forget the requested word size.
        $binary = "" if ($?);
    }

    if ($binary == 64) {
        # -m64, -q64, -maix64 (for AIX), -mabi=64 (for gfortran MIPS),
        # and finally -dryrun without -v (for nagfor).
        foreach my $options ("-m64 -v", "-q64 -v", "-maix64 -v", "-mabi=64 -v", "-dryrun") {
            $link = `$compiler $openmp $options ftest2.f 2>&1 && rm -f a.out a.exe`;
            last unless $?;
        }

        $binary = "" if ($?);
    }

    if ($binary eq "") {
        $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
    }
}

# These two vendors always get their own probe command, replacing any
# result obtained above.
if ($vendor eq "NAG") {
    $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
if ($vendor eq "CRAY") {
    $link = `$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# Extract library search paths and libraries from the compiler output in $link.
# Results:
#   $linker_L  -L paths and options forwarded as -Wl,...
#   $linker_l  -l libraries (plus selected NAG object files)
#   $linker_a  static archives (*.a)
$linker_L = "";
$linker_l = "";
$linker_a = "";

if ($link ne "") {
    # Join an option to its argument with '%' ("-rpath DIR" -> "-rpath%DIR")
    # so the pair stays together through the whitespace split below.
    $link =~ s/\-Y\sP\,/\-Y/g;
    $link =~ s/\-R\s*/\-rpath\%/g;
    $link =~ s/\-rpath\s+/\-rpath\%/g;
    $link =~ s/\-rpath-link\s+/\-rpath-link\%/g;

    @flags = split(/[\s\,\n]/, $link);
    # remove leading and trailing quotes from each flag.
    @flags = map {s/^['"]|['"]$//g; $_} @flags;

    foreach $flags (@flags) {
        if (
            ($flags =~ /^\-L/)
            && ($flags !~ /^-LIST:/)
            && ($flags !~ /^-LANG:/)
            ) {
            $linker_L .= $flags . " ";
        }

        if ($flags =~ /^\-Y/) {
            next if ($hostos eq 'SunOS');
            $linker_L .= "-Wl,". $flags . " ";
        }

        if ($flags =~ /^\--exclude-libs/) {
            $linker_L .= "-Wl,". $flags . " ";
            # Consumed: clear it so none of the checks below match it.
            $flags="";
        }

        if ($flags =~ /^\-rpath\%/) {
            $flags =~ s/\%/\,/g;
            $linker_L .= "-Wl,". $flags . " " ;
        }

        if ($flags =~ /^\-rpath-link\%/) {
            $flags =~ s/\%/\,/g;
            $linker_L .= "-Wl,". $flags . " " ;
        }

        # When the C compiler is clang, substitute -lomp for -lgomp.
        if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
            $flags = "-lomp";
        }

        # Keep -l libraries, except startup/system ones and most names that
        # contain digits.
        if (
            ($flags =~ /^\-l/)
            && ($flags !~ /ibrary/)
            && ($flags !~ /gfortranbegin/)
            && ($flags !~ /flangmain/)
            && ($flags !~ /frtbegin/)
            && ($flags !~ /pathfstart/)
            && ($flags !~ /crt[0-9]/)
            && ($flags !~ /gcc/)
            && ($flags !~ /user32/)
            && ($flags !~ /kernel32/)
            && ($flags !~ /advapi32/)
            && ($flags !~ /shell32/)
            && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/))
            # Vendor names are strings, so compare with 'eq'. The numeric '=='
            # turned both sides into 0 and was true for every vendor.
            && ($flags !~ /[0-9]+/ || ($vendor eq "FUJITSU" && $flags =~ /^-lfj90/))
            && ($flags !~ /^\-l$/)
            ) {
            $linker_l .= $flags . " ";
        }

        # Object files that are carried over only for the NAG compiler.
        if ( $flags =~ /quickfit.o/ && $vendor eq "NAG") {
            $linker_l .= $flags . " ";
        }
        if ( $flags =~ /safefit.o/ && $vendor eq "NAG") {
            $linker_l .= $flags . " ";
        }
        if ( $flags =~ /thsafe.o/ && $vendor eq "NAG") {
            $linker_l .= $flags . " ";
        }

        $linker_a .= $flags . " " if $flags =~ /\.a$/;
    }
}

if ($vendor eq "FLANG"){
    $linker_a .= "-lflang";
}
# Append the detected Fortran settings to the generated makefile fragment
# ($makefile) and to the generated C configuration header ($config).
open(MAKEFILE, ">> $makefile") or die "Can't append $makefile";
open(CONFFILE, ">> $config")   or die "Can't append $config";

print MAKEFILE "F_COMPILER=$vendor\n" . "FC=$compiler\n";
print MAKEFILE "BU=$bu\n" unless $bu eq "";
if ($nofortran == 1) {
    print MAKEFILE "NOFORTRAN=1\n";
}

unless ($bu eq "") {
    print CONFFILE "#define BUNDERSCORE\t$bu\n" . "#define NEEDBUNDERSCORE\t1\n";
}

unless ($need2bu eq "") {
    print CONFFILE "#define NEED2UNDERSCORES\t1\n";
    print MAKEFILE "NEED2UNDERSCORES=1\n";
}

# FEXTRALIB is written only when at least one library or archive was found.
unless ($linker_l eq "" && $linker_a eq "") {
    print MAKEFILE join(" ", "FEXTRALIB=$linker_L", $linker_l, $linker_a) . "\n";
}

close(MAKEFILE);
close(CONFFILE);
| @@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/sysinfo.h> | |||
| #endif | |||
| #if defined(__x86_64__) || defined(_M_X64) | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #else | |||
| #ifndef NO_AVX512 | |||
| #define NO_AVX512 | |||
| #endif | |||
| #endif | |||
| #endif | |||
| /* #define FORCE_P2 */ | |||
| /* #define FORCE_KATMAI */ | |||
| /* #define FORCE_COPPERMINE */ | |||
| @@ -140,9 +132,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_PPC440FP2 */ | |||
| /* #define FORCE_CELL */ | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3R3 */ | |||
| /* #define FORCE_LOONGSON3R4 */ | |||
| /* #define FORCE_LOONGSON3R5 */ | |||
| /* #define FORCE_LOONGSON3R3 */ | |||
| /* #define FORCE_LOONGSON3R4 */ | |||
| /* #define FORCE_LOONGSON3R5 */ | |||
| /* #define FORCE_LOONGSON2K1000 */ | |||
| /* #define FORCE_LOONGSONGENERIC */ | |||
| /* #define FORCE_I6400 */ | |||
| /* #define FORCE_P6600 */ | |||
| /* #define FORCE_P5600 */ | |||
| @@ -977,6 +971,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
/* Forced target LOONGSON2K1000: fixed identification strings and
   cache/TLB parameters used when this core is selected explicitly. */
#ifdef FORCE_LOONGSON2K1000
#define FORCE
#define ARCHITECTURE    "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON2K1000"
#define SUBDIRNAME      "loongarch64"
#define ARCHCONFIG                  \
    "-DLOONGSON2K1000 "             \
    "-DL1_DATA_SIZE=65536 "         \
    "-DL1_DATA_LINESIZE=64 "        \
    "-DL2_SIZE=262144 "             \
    "-DL2_LINESIZE=64 "             \
    "-DDTB_DEFAULT_ENTRIES=64 "     \
    "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=16 "
#define LIBNAME         "loongson2k1000"
#define CORENAME        "LOONGSON2K1000"
#endif
/* Forced target LOONGSONGENERIC: fixed identification strings and
   cache/TLB parameters used when this core is selected explicitly. */
#ifdef FORCE_LOONGSONGENERIC
#define FORCE
#define ARCHITECTURE    "LOONGARCH"
#define SUBARCHITECTURE "LOONGSONGENERIC"
#define SUBDIRNAME      "loongarch64"
#define ARCHCONFIG                  \
    "-DLOONGSONGENERIC "            \
    "-DL1_DATA_SIZE=65536 "         \
    "-DL1_DATA_LINESIZE=64 "        \
    "-DL2_SIZE=262144 "             \
    "-DL2_LINESIZE=64 "             \
    "-DDTB_DEFAULT_ENTRIES=64 "     \
    "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=16 "
#define LIBNAME         "loongsongeneric"
#define CORENAME        "LOONGSONGENERIC"
#endif
| #ifdef FORCE_I6400 | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| @@ -1240,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa53" | |||
| #define CORENAME "CORTEXA53" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA57 | |||
| @@ -1256,7 +1277,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa57" | |||
| #define CORENAME "CORTEXA57" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA72 | |||
| @@ -1272,7 +1292,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa72" | |||
| #define CORENAME "CORTEXA72" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA73 | |||
| @@ -1288,7 +1307,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa73" | |||
| #define CORENAME "CORTEXA73" | |||
| #else | |||
| #endif | |||
/* Forced target CORTEXX1 (ARM64): identification strings plus the
   cache/TLB sizes and feature defines exported through ARCHCONFIG. */
#ifdef FORCE_CORTEXX1
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "CORTEXX1"
#define SUBDIRNAME      "arm64"
#define ARCHCONFIG                                              \
    "-DCORTEXX1 "                                               \
    "-DL1_DATA_SIZE=32768 " "-DL1_DATA_LINESIZE=64 "            \
    "-DL2_SIZE=262144 " "-DL2_LINESIZE=64 "                     \
    "-DDTB_DEFAULT_ENTRIES=64 " "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=32 "                                      \
    "-DHAVE_VFPV4 " "-DHAVE_VFPV3 " "-DHAVE_VFP " "-DHAVE_NEON " \
    "-DARMV8"
#define LIBNAME         "cortexx1"
#define CORENAME        "CORTEXX1"
#endif
/* Forced target CORTEXX2 (ARM64): identification strings plus the
   cache/TLB sizes and feature defines exported through ARCHCONFIG. */
#ifdef FORCE_CORTEXX2
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "CORTEXX2"
#define SUBDIRNAME      "arm64"
#define ARCHCONFIG                                              \
    "-DCORTEXX2 "                                               \
    "-DL1_DATA_SIZE=32768 " "-DL1_DATA_LINESIZE=64 "            \
    "-DL2_SIZE=262144 " "-DL2_LINESIZE=64 "                     \
    "-DDTB_DEFAULT_ENTRIES=64 " "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=32 "                                      \
    "-DHAVE_VFPV4 " "-DHAVE_VFPV3 " "-DHAVE_VFP " "-DHAVE_NEON " \
    "-DHAVE_SVE " "-DARMV8 " "-DARMV9"
#define LIBNAME         "cortexx2"
#define CORENAME        "CORTEXX2"
#endif
/* Forced target CORTEXA510 (ARM64): identification strings plus the
   cache/TLB sizes and feature defines exported through ARCHCONFIG. */
#ifdef FORCE_CORTEXA510
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "CORTEXA510"
#define SUBDIRNAME      "arm64"
#define ARCHCONFIG                                              \
    "-DCORTEXA510 "                                             \
    "-DL1_DATA_SIZE=32768 " "-DL1_DATA_LINESIZE=64 "            \
    "-DL2_SIZE=262144 " "-DL2_LINESIZE=64 "                     \
    "-DDTB_DEFAULT_ENTRIES=64 " "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=32 "                                      \
    "-DHAVE_VFPV4 " "-DHAVE_VFPV3 " "-DHAVE_VFP " "-DHAVE_NEON " \
    "-DHAVE_SVE " "-DARMV8 " "-DARMV9"
#define LIBNAME         "cortexa510"
#define CORENAME        "CORTEXA510"
#endif
/* Forced target CORTEXA710 (ARM64): identification strings plus the
   cache/TLB sizes and feature defines exported through ARCHCONFIG. */
#ifdef FORCE_CORTEXA710
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "CORTEXA710"
#define SUBDIRNAME      "arm64"
#define ARCHCONFIG                                              \
    "-DCORTEXA710 "                                             \
    "-DL1_DATA_SIZE=32768 " "-DL1_DATA_LINESIZE=64 "            \
    "-DL2_SIZE=262144 " "-DL2_LINESIZE=64 "                     \
    "-DDTB_DEFAULT_ENTRIES=64 " "-DDTB_SIZE=4096 "              \
    "-DL2_ASSOCIATIVE=32 "                                      \
    "-DHAVE_VFPV4 " "-DHAVE_VFPV3 " "-DHAVE_VFP " "-DHAVE_NEON " \
    "-DHAVE_SVE " "-DARMV8 " "-DARMV9"
#define LIBNAME         "cortexa710"
#define CORENAME        "CORTEXA710"
#endif
| #ifdef FORCE_NEOVERSEN1 | |||
| @@ -1305,7 +1379,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-march=armv8.2-a -mtune=neoverse-n1" | |||
| #define LIBNAME "neoversen1" | |||
| #define CORENAME "NEOVERSEN1" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_NEOVERSEV1 | |||
| @@ -1322,7 +1395,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-march=armv8.4-a -mtune=neoverse-v1" | |||
| #define LIBNAME "neoversev1" | |||
| #define CORENAME "NEOVERSEV1" | |||
| #else | |||
| #endif | |||
| @@ -1340,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-march=armv8.5-a -mtune=neoverse-n2" | |||
| #define LIBNAME "neoversen2" | |||
| #define CORENAME "NEOVERSEN2" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA55 | |||
| @@ -1356,7 +1427,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa55" | |||
| #define CORENAME "CORTEXA55" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_FALKOR | |||
| @@ -1372,7 +1442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "falkor" | |||
| #define CORENAME "FALKOR" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_THUNDERX | |||
| @@ -1387,7 +1456,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx" | |||
| #define CORENAME "THUNDERX" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_THUNDERX2T99 | |||
| @@ -1405,7 +1473,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx2t99" | |||
| #define CORENAME "THUNDERX2T99" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_TSV110 | |||
| @@ -1421,7 +1488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "tsv110" | |||
| #define CORENAME "TSV110" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_EMAG8180 | |||
| @@ -1456,7 +1522,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx3t110" | |||
| #define CORENAME "THUNDERX3T110" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_VORTEX | |||
| @@ -1488,7 +1553,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
| #define LIBNAME "a64fx" | |||
| #define CORENAME "A64FX" | |||
| #else | |||
| #endif | |||
/* Forced target FT2000 (ARM64): identification strings plus the cache/TLB
   sizes and feature defines exported through ARCHCONFIG.

   ARCHCONFIG is a space-separated list of -D options, so each string piece
   must end in a blank. The L2 piece previously read
   "-DL2_SIZE=33554426-DL2_LINESIZE=64", which fused two options into one
   and left L2_LINESIZE undefined.

   NOTE(review): 33554426 is 6 bytes short of 32 MiB (33554432); the value
   is kept as found - confirm the intended L2 size. */
#ifdef FORCE_FT2000
#define ARMV8
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "FT2000"
#define SUBDIRNAME      "arm64"
#define ARCHCONFIG   "-DFT2000 " \
       "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
       "-DL2_SIZE=33554426 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME   "ft2000"
#define CORENAME  "FT2000"
#endif
| #ifdef FORCE_ZARCH_GENERIC | |||
| @@ -1524,6 +1604,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef FORCE_C910V | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #ifdef NO_RV64GV | |||
| #define SUBARCHITECTURE "RISCV64_GENERIC" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_GENERIC " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_generic" | |||
| #define CORENAME "RISCV64_GENERIC" | |||
| #else | |||
| #define SUBARCHITECTURE "C910V" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DC910V " \ | |||
| @@ -1532,6 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "c910v" | |||
| #define CORENAME "C910V" | |||
| #endif | |||
| #else | |||
| #endif | |||
| @@ -1632,17 +1723,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static int get_num_cores(void) { | |||
| int count; | |||
| #ifdef OS_WINDOWS | |||
| SYSTEM_INFO sysinfo; | |||
| #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| int m[2], count; | |||
| int m[2]; | |||
| size_t len; | |||
| #endif | |||
| #if defined(linux) || defined(__sun__) | |||
| //returns the number of processors which are currently online | |||
| return sysconf(_SC_NPROCESSORS_CONF); | |||
| count = sysconf(_SC_NPROCESSORS_CONF); | |||
| if (count <= 0) count = 2; | |||
| return count; | |||
| #elif defined(OS_WINDOWS) | |||
| GetSystemInfo(&sysinfo); | |||
| @@ -1653,13 +1747,15 @@ static int get_num_cores(void) { | |||
| m[1] = HW_NCPU; | |||
| len = sizeof(int); | |||
| sysctl(m, 2, &count, &len, NULL, 0); | |||
| if (count <= 0) count = 2; | |||
| return count; | |||
| #elif defined(AIX) | |||
| //returns the number of processors which are currently online | |||
| return sysconf(_SC_NPROCESSORS_ONLN); | |||
| count = sysconf(_SC_NPROCESSORS_ONLN); | |||
| if (count <= 0) count = 2; | |||
| #else | |||
| return 2; | |||
| #endif | |||
| @@ -1681,7 +1777,7 @@ int main(int argc, char *argv[]){ | |||
| #ifdef FORCE | |||
| printf("CORE=%s\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) | |||
| printf("CORE=%s\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -1829,7 +1925,7 @@ printf("ELF_VERSION=2\n"); | |||
| #ifdef FORCE | |||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) | |||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -531,8 +531,11 @@ $(BLASOBJS) $(BLASOBJS_P) : functable.h | |||
| $(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) | |||
| functable.h : Makefile | |||
| ifndef USE_PERL | |||
| ./create $(FUNCALLFILES) > functable.h | |||
| else | |||
| ./create.pl $(FUNCALLFILES) > functable.h | |||
| endif | |||
| endif | |||
| clean :: | |||
| @@ -1,22 +1,22 @@ | |||
| #!/usr/bin/env perl | |||
| #!/bin/sh | |||
| $count = 0; | |||
| count=0 | |||
| foreach (@ARGV) { | |||
| print "#define\tinterface_", $_, "\t\t", $count, "\n"; | |||
| $count ++; | |||
| } | |||
| for arg in "$@"; do | |||
| printf "#define\tinterface_%s\t\t%d\n" "$arg" "$count" | |||
| count=`expr $count + 1` | |||
| done | |||
| print "#ifdef USE_FUNCTABLE\n"; | |||
| printf "#ifdef USE_FUNCTABLE\n" | |||
| print "#define MAX_PROF_TABLE ", $count, "\n"; | |||
| printf "#define MAX_PROF_TABLE %d\n" "$count" | |||
| print "static char *func_table[] = {\n"; | |||
| printf "static char *func_table[] = {\n" | |||
| foreach (@ARGV) { | |||
| print "\"", $_, "\",\n"; | |||
| } | |||
| for arg in "$@"; do | |||
| printf "\"%s\",\n" "$arg" | |||
| done | |||
| print "};\n"; | |||
| print "#endif\n"; | |||
| printf "};\n" | |||
| printf "#endif\n" | |||
| @@ -0,0 +1,22 @@ | |||
#!/usr/bin/env perl
# Print a C header fragment for the names given on the command line:
# one "#define interface_<name> <index>" per argument, followed by a
# USE_FUNCTABLE-guarded table of the same names as strings.

$count = 0;

for my $name (@ARGV) {
    print "#define\tinterface_" . $name . "\t\t" . $count . "\n";
    $count++;
}

print "#ifdef USE_FUNCTABLE\n";
print "#define MAX_PROF_TABLE " . $count . "\n";
print "static char *func_table[] = {\n";

# One quoted entry per argument, in command-line order.
print "\"" . $_ . "\",\n" for @ARGV;

print "};\n";
print "#endif\n";
| @@ -678,7 +678,7 @@ endif () | |||
| set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) | |||
| set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||
| set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") | |||
| @@ -854,49 +854,49 @@ endif () | |||
| # Makefile.LA | |||
| if(NOT NO_LAPACK) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (NOT DEFINED ${float_char}NEG_TCOPY) | |||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") | |||
| set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c) | |||
| set(${float_char}NEG_TCOPY ../generic/zneg_tcopy_${${float_char}GEMM_UNROLL_M}.c) | |||
| else () | |||
| set(${float_char}NEG_TCOPY ../generic/neg_tcopy.c) | |||
| set(${float_char}NEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c) | |||
| endif () | |||
| endif () | |||
| if (NOT DEFINED ${float_char}LASWP_NCOPY) | |||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") | |||
| set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy.c) | |||
| set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) | |||
| else () | |||
| set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy.c) | |||
| set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) | |||
| endif () | |||
| endif () | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}" "" "neg_tcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}" "" "laswp_ncopy" false "" "" false ${float_type}) | |||
| endforeach() | |||
| if (BUILD_COMPLEX AND NOT BUILD_SINGLE) | |||
| if (NOT DEFINED SNEG_TCOPY) | |||
| set(SNEG_TCOPY ../generic/neg_tcopy.c) | |||
| set(SNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c) | |||
| endif () | |||
| if (NOT DEFINED SLASWP_NCOPY) | |||
| set(SLASWP_NCOPY ../generic/laswp_ncopy.c) | |||
| set(SLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}" "" "neg_tcopy" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "SINGLE") | |||
| endif() | |||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| if (NOT DEFINED DNEG_TCOPY) | |||
| set(DNEG_TCOPY ../generic/neg_tcopy.c) | |||
| set(DNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c) | |||
| endif () | |||
| if (NOT DEFINED DLASWP_NCOPY) | |||
| set(DLASWP_NCOPY ../generic/laswp_ncopy.c) | |||
| set(DLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" "" false "DOUBLE") | |||
| endif() | |||
| endif() | |||
| @@ -0,0 +1,216 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| @@ -0,0 +1,216 @@ | |||
# Kernel source selection: each *KERNEL / *COPY variable below names the source
# file used for that routine. Entries without a directory prefix are plain
# file names; entries starting with ../arm or ../generic point at C sources in
# those sibling directories.
#
# amin / max / min and their index variants: C sources from ../arm.
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
# TRSM kernels: the S, D, C and Z variables all name the same four
# trsm_kernel_{LN,LT,RN,RT}_sve.c sources. The TRSM copy routines use
# trsm_*copy_sve.c, with separate ztrsm_*copy_sve.c sources for ZTRSMCOPY*.
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
# Routines served by .S sources: amax, axpy, rot, scal, gemv (n/t), asum,
# copy, swap, iamax and nrm2.
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
# Dot kernels. DDOT and DSDOT always use dot.S. SDOT and CDOT/ZDOT depend on
# C_COMPILER: when it is not PGI, SDOT uses ../generic/dot.c and CDOT/ZDOT use
# zdot.S; when it is PGI, SDOT uses dot.S and CDOT/ZDOT use ../arm/zdot.c.
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
# GEMM beta kernels.
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
# SGEMM / STRMM: the kernel file names are parameterised by SGEMM_UNROLL_N.
# NOTE(review): SGEMMONCOPY/SGEMMOTCOPY are parameterised by DGEMM_UNROLL_N,
# not SGEMM_UNROLL_N - confirm this is intended.
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
# DGEMM / DTRMM: kernel and outer-copy file names are parameterised by
# DGEMM_UNROLL_N. The TRMM/SYMM copy sources are the same files as for the
# S variants above.
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
# CGEMM / CTRMM: file names are parameterised by ZGEMM_UNROLL_N, and the outer
# copies and the TRMM/HEMM/SYMM copies use the z-prefixed sources.
# NOTE(review): ZGEMM_UNROLL_N (not a CGEMM_* value) is used here - confirm.
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
# ZGEMM / ZTRMM: same layout as the C section, with z-prefixed kernel and
# inner-copy sources.
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| @@ -0,0 +1 @@ | |||
# Take the kernel selection from KERNEL.CORTEXA57 in $(KERNELDIR) unchanged.
| include $(KERNELDIR)/KERNEL.CORTEXA57 | |||
| @@ -0,0 +1,216 @@ | |||
# Kernel source selection: each *KERNEL / *COPY variable below names the source
# file used for that routine. Entries without a directory prefix are plain
# file names; entries starting with ../arm or ../generic point at C sources in
# those sibling directories.
#
# amin / max / min and their index variants: C sources from ../arm.
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
# TRSM kernels: the S, D, C and Z variables all name the same four
# trsm_kernel_{LN,LT,RN,RT}_sve.c sources. The TRSM copy routines use
# trsm_*copy_sve.c, with separate ztrsm_*copy_sve.c sources for ZTRSMCOPY*.
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
# Routines served by .S sources: amax, axpy, rot, scal, gemv (n/t), asum,
# copy, swap, iamax and nrm2.
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
# Dot kernels. DDOT and DSDOT always use dot.S. SDOT and CDOT/ZDOT depend on
# C_COMPILER: when it is not PGI, SDOT uses ../generic/dot.c and CDOT/ZDOT use
# zdot.S; when it is PGI, SDOT uses dot.S and CDOT/ZDOT use ../arm/zdot.c.
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
# GEMM beta kernels.
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
# SGEMM / STRMM: the kernel file names are parameterised by SGEMM_UNROLL_N.
# NOTE(review): SGEMMONCOPY/SGEMMOTCOPY are parameterised by DGEMM_UNROLL_N,
# not SGEMM_UNROLL_N - confirm this is intended.
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
# DGEMM / DTRMM: kernel and outer-copy file names are parameterised by
# DGEMM_UNROLL_N. The TRMM/SYMM copy sources are the same files as for the
# S variants above.
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
# CGEMM / CTRMM: file names are parameterised by ZGEMM_UNROLL_N, and the outer
# copies and the TRMM/HEMM/SYMM copies use the z-prefixed sources.
# NOTE(review): ZGEMM_UNROLL_N (not a CGEMM_* value) is used here - confirm.
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
# ZGEMM / ZTRMM: same layout as the C section, with z-prefixed kernel and
# inner-copy sources.
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| @@ -0,0 +1,3 @@ | |||
# Take the kernel selection from KERNEL.CORTEXA57 in $(KERNELDIR) unchanged.
| include $(KERNELDIR)/KERNEL.CORTEXA57 | |||
| @@ -187,3 +187,14 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# SBGEMM kernel selection. The kernel file name is parameterised by
# SBGEMM_UNROLL_M and SBGEMM_UNROLL_N. The inner and outer "ncopy" variables
# share sbgemm_ncopy_neoversen2.c, and the inner and outer "tcopy" variables
# share sbgemm_tcopy_neoversen2.c; each still gets its own object name.
| SBGEMM_BETA = sbgemm_beta_neoversen2.c | |||
| SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c | |||
| SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c | |||
| SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c | |||
| SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c | |||
| SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,173 @@ | |||
# Kernel source selection built from C sources in ../generic and ../arm.
#
# DSDOT only receives this default when DSDOTKERNEL is not already defined.
| ifndef DSDOTKERNEL | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| endif | |||
# GEMM beta kernels from ../generic (one real source, one z-prefixed source).
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
# TRMM and GEMM kernels: the 2x2 sources from ../generic, with the *_2.c copy
# routines. The copy object names here are fixed strings ending in .o.
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
# TRSM kernels: all four type prefixes name the same ../generic sources.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# NOTE(review): CGEMM3MKERNEL and ZGEMM3MKERNEL are assigned again at the end
# of this block (under "#Dump kernel"). With make's usual last-assignment
# behaviour the later values would apply, and the *_sse3.S names below look
# like x86 sources - confirm whether these two lines are still wanted.
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
| SAMAXKERNEL = ../arm/amax.c | |||
| DAMAXKERNEL = ../arm/amax.c | |||
| CAMAXKERNEL = ../arm/zamax.c | |||
| ZAMAXKERNEL = ../arm/zamax.c | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMAXKERNEL = ../arm/iamax.c | |||
| IDAMAXKERNEL = ../arm/iamax.c | |||
| ICAMAXKERNEL = ../arm/izamax.c | |||
| IZAMAXKERNEL = ../arm/izamax.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| SASUMKERNEL = ../arm/asum.c | |||
| DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| SCOPYKERNEL = ../arm/copy.c | |||
| DCOPYKERNEL = ../arm/copy.c | |||
| CCOPYKERNEL = ../arm/zcopy.c | |||
| ZCOPYKERNEL = ../arm/zcopy.c | |||
| SDOTKERNEL = ../arm/dot.c | |||
| DDOTKERNEL = ../arm/dot.c | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| SROTKERNEL = ../arm/rot.c | |||
| DROTKERNEL = ../arm/rot.c | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| SSCALKERNEL = ../arm/scal.c | |||
| DSCALKERNEL = ../arm/scal.c | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| SSWAPKERNEL = ../arm/swap.c | |||
| DSWAPKERNEL = ../arm/swap.c | |||
| CSWAPKERNEL = ../arm/zswap.c | |||
| ZSWAPKERNEL = ../arm/zswap.c | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
# SYMV / HEMV kernels, LSAME and CABS helpers: all from ../generic.
| SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
# Second assignment of the two GEMM3M kernel variables (first one is under the
# "#Todo" comment earlier in this block).
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| @@ -404,6 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | |||
| nrm2_compute(n, x, inc_x, &ssq, &scale); | |||
| #endif | |||
| if (fabs(scale) <1.e-300) return 0.; | |||
| ssq = sqrt(ssq) * scale; | |||
| return ssq; | |||
| @@ -0,0 +1,83 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, | |||
| BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, | |||
| BLASLONG ldc) { | |||
| BLASLONG i, j; | |||
| BLASLONG chunk, remain; | |||
| FLOAT *c_offset1, *c_offset; | |||
| c_offset = c; | |||
| chunk = m >> 3; | |||
| remain = m & 7; | |||
| if (beta == ZERO) { | |||
| for (j = n; j > 0; j--) { | |||
| c_offset1 = c_offset; | |||
| c_offset += ldc; | |||
| for (i = chunk; i > 0; i--) { | |||
| *(c_offset1 + 0) = ZERO; | |||
| *(c_offset1 + 1) = ZERO; | |||
| *(c_offset1 + 2) = ZERO; | |||
| *(c_offset1 + 3) = ZERO; | |||
| *(c_offset1 + 4) = ZERO; | |||
| *(c_offset1 + 5) = ZERO; | |||
| *(c_offset1 + 6) = ZERO; | |||
| *(c_offset1 + 7) = ZERO; | |||
| c_offset1 += 8; | |||
| } | |||
| for (i = remain; i > 0; i--) { | |||
| *c_offset1 = ZERO; | |||
| c_offset1++; | |||
| } | |||
| } | |||
| } else { | |||
| for (j = n; j > 0; j--) { | |||
| c_offset1 = c_offset; | |||
| c_offset += ldc; | |||
| for (i = chunk; i > 0; i--) { | |||
| *(c_offset1 + 0) *= beta; | |||
| *(c_offset1 + 1) *= beta; | |||
| *(c_offset1 + 2) *= beta; | |||
| *(c_offset1 + 3) *= beta; | |||
| *(c_offset1 + 4) *= beta; | |||
| *(c_offset1 + 5) *= beta; | |||
| *(c_offset1 + 6) *= beta; | |||
| *(c_offset1 + 7) *= beta; | |||
| c_offset1 += 8; | |||
| } | |||
| for (i = remain; i > 0; i--) { | |||
| *c_offset1 *= beta; | |||
| c_offset1++; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| }; | |||
| @@ -0,0 +1,45 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/* The implementation file is included twice: once with ALPHA_ONE defined and
 * once with it undefined, so the same source is compiled in both
 * configurations within this translation unit. */
| #define ALPHA_ONE | |||
| #include "sbgemm_kernel_8x4_neoversen2_impl.c" | |||
| #undef ALPHA_ONE | |||
| #include "sbgemm_kernel_8x4_neoversen2_impl.c" | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
| FLOAT *C, BLASLONG ldc) { | |||
| if (alpha == 1.0f) | |||
| return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||
| else | |||
| return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,665 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * LOAD_C* / STORE_C* macro families, selected by ALPHA_ONE.
 *
 * All of them address C through ptr_c0<N> + 2 * M with the index vector
 * off_vc, and differ only in the predicate used: pg32, pg32_low, pg32_even or
 * pg32_first. Those names, and svalpha / oc<M><N> / mc<M><N>, are not defined
 * here and must be in scope where the macros are expanded. Every macro body
 * already ends with ';'.
 */
| #ifdef ALPHA_ONE | |||
/* ALPHA_ONE variant: the gathered C values are loaded straight into the
 * accumulator mc<M><N>, and STORE_C* scatters mc<M><N> back unchanged. */
| #define LOAD_C(M, N) \ | |||
| mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); | |||
| #define LOAD_C_LOW(M, N) \ | |||
| mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc); | |||
| #define LOAD_C_EVEN(M, N) \ | |||
| mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc); | |||
| #define LOAD_C_FIRST(M, N) \ | |||
| mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc); | |||
| #define STORE_C(M, N) \ | |||
| svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_LOW(M, N) \ | |||
| svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_EVEN(M, N) \ | |||
| svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_FIRST(M, N) \ | |||
| svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #else | |||
/* General variant: LOAD_C* zeroes the accumulator mc<M><N> and gathers the
 * existing C values into oc<M><N>. STORE_C* then combines them with
 * svmad_z(pred, svalpha, mc, oc) - per the ACLE definition of svmad this is
 * svalpha * mc + oc, with inactive lanes zeroed - and scatters the result. */
| #define LOAD_C(M, N) \ | |||
| mc##M##N = svdup_f32(0); \ | |||
| oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); | |||
| #define LOAD_C_LOW(M, N) \ | |||
| mc##M##N = svdup_f32(0); \ | |||
| oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc); | |||
| #define LOAD_C_EVEN(M, N) \ | |||
| mc##M##N = svdup_f32(0); \ | |||
| oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc); | |||
| #define LOAD_C_FIRST(M, N) \ | |||
| mc##M##N = svdup_f32(0); \ | |||
| oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc); | |||
| #define STORE_C(M, N) \ | |||
| mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \ | |||
| svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_LOW(M, N) \ | |||
| mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \ | |||
| svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_EVEN(M, N) \ | |||
| mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \ | |||
| svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #define STORE_C_FIRST(M, N) \ | |||
| mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \ | |||
| svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); | |||
| #endif | |||
| #define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M); | |||
| #define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N); | |||
| #define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); | |||
| #define LOAD_KREST_1(NAME, M) \ | |||
| m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \ | |||
| *(ptr_##NAME##M + 1), zero, zero, zero); | |||
| #define LOAD_KREST_1_LOW(NAME, M) \ | |||
| m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \ | |||
| zero, zero); | |||
| #define LOAD_KREST_2(NAME, M) \ | |||
| m##NAME##M = \ | |||
| svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \ | |||
| *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero); | |||
| #define LOAD_KREST_2_LOW(NAME, M) \ | |||
| m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \ | |||
| zero, zero, zero, zero, zero); | |||
| #define LOAD_KREST_3(NAME, M) \ | |||
| m##NAME##M = \ | |||
| svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ | |||
| *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \ | |||
| *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero); | |||
| #define LOAD_KREST_3_LOW(NAME, M) \ | |||
| m##NAME##M = \ | |||
| svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ | |||
| *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero); | |||
| /* | |||
| * bfloat16 GEMM micro-kernel built on the SVE svbfmmla intrinsic. | |||
| * | |||
| * Accumulates the product of the A and B panels into C. When ALPHA_ONE is | |||
| * defined the accumulators are seeded with the current C values, so the | |||
| * result is C += A * B and `alpha` is not applied; otherwise the product is | |||
| * multiplied by `alpha` before being added to C (see LOAD_C / STORE_C). | |||
| * | |||
| * m, n  - extent of the C tile. The m index is contiguous in memory and is | |||
| *         walked in blocks of 8, 4, 2 and 1; the n index has stride ldc | |||
| *         and is walked in blocks of 4, 2 and 1. | |||
| * k     - shared dimension, consumed four values at a time; the remaining | |||
| *         k & 3 values are zero-padded by the LOAD_KREST_* macros. | |||
| * alpha - scale factor, used only when ALPHA_ONE is not defined. | |||
| * A, B  - bfloat16 panels. The pointer arithmetic assumes every pair of m | |||
| *         (resp. n) indices occupies 2 * k consecutive values and a | |||
| *         trailing single index occupies k values -- presumably the layout | |||
| *         written by the matching copy routines; confirm against them. | |||
| * C     - FLOAT matrix updated in place. | |||
| * ldc   - distance, in elements, between consecutive n indices of C. | |||
| * | |||
| * Always returns 0. | |||
| * | |||
| * NOTE(review): operand pointers advance by 8 bf16 values per full vector | |||
| * load, which implies a 128-bit vector length -- confirm the kernel is only | |||
| * dispatched on such targets. | |||
| */ | |||
| #ifdef ALPHA_ONE | |||
| int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
|   bfloat16_t *ptr_a = (bfloat16_t *)A; | |||
|   bfloat16_t *ptr_b = (bfloat16_t *)B; | |||
|   FLOAT *ptr_c = C; | |||
|   bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; | |||
|   bfloat16_t *ptr_b0, *ptr_b1; | |||
|   FLOAT *ptr_c00, *ptr_c01; | |||
|   svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; | |||
|   svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31; | |||
| #ifndef ALPHA_ONE | |||
|   svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31; | |||
| #endif | |||
|   svbool_t pg16 = svptrue_b16(); | |||
|   // First four bf16 lanes only: used when a single m or n index remains. | |||
|   svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); | |||
|   svbool_t pg32 = svptrue_b32(); | |||
|   // Lane order follows off_vc below: {0, ldc, 1, ldc + 1}. | |||
|   svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);   // one m index, two n indices | |||
|   svbool_t pg32_even = svdupq_b32(1, 0, 1, 0);  // two m indices, one n index | |||
|   svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); // single element | |||
|   svfloat32_t svalpha = svdup_f32(alpha); | |||
|   bfloat16 tmp = 0; | |||
|   bfloat16_t zero = *((bfloat16_t *)&tmp); | |||
|   BLASLONG krest = k & 3; | |||
|   // 00 01 10 11 | |||
|   svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1); | |||
|  | |||
|   // ---- four n indices per pass ---- | |||
|   for (BLASLONG j = 0; j < n / 4; j++) { | |||
|     ptr_c00 = ptr_c; | |||
|     ptr_c01 = ptr_c + 2 * ldc; | |||
|     ptr_c += 4 * ldc; | |||
|     ptr_a = (bfloat16_t *)A; | |||
|     for (BLASLONG i = 0; i < m / 8; i++) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a2 = ptr_a1 + 2 * k; | |||
|       ptr_a3 = ptr_a2 + 2 * k; | |||
|       ptr_a += 8 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       ptr_b1 = ptr_b0 + 2 * k; | |||
|       LOAD_C(0, 0); LOAD_C(0, 1); | |||
|       LOAD_C(1, 0); LOAD_C(1, 1); | |||
|       LOAD_C(2, 0); LOAD_C(2, 1); | |||
|       LOAD_C(3, 0); LOAD_C(3, 1); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); | |||
|         LOAD_B(0); LOAD_B(1); | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         MATMUL(1, 0); MATMUL(1, 1); | |||
|         MATMUL(2, 0); MATMUL(2, 1); | |||
|         MATMUL(3, 0); MATMUL(3, 1); | |||
|         ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; | |||
|         ptr_b0 += 8; ptr_b1 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); | |||
|           LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); | |||
|           LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); | |||
|           LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); | |||
|         } | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         MATMUL(1, 0); MATMUL(1, 1); | |||
|         MATMUL(2, 0); MATMUL(2, 1); | |||
|         MATMUL(3, 0); MATMUL(3, 1); | |||
|       } | |||
|       STORE_C(0, 0); STORE_C(0, 1); | |||
|       STORE_C(1, 0); STORE_C(1, 1); | |||
|       STORE_C(2, 0); STORE_C(2, 1); | |||
|       STORE_C(3, 0); STORE_C(3, 1); | |||
|       ptr_c00 += 8; ptr_c01 += 8; | |||
|     } | |||
|     if (m & 4) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a += 4 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       ptr_b1 = ptr_b0 + 2 * k; | |||
|       LOAD_C(0, 0); LOAD_C(0, 1); | |||
|       LOAD_C(1, 0); LOAD_C(1, 1); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); | |||
|         LOAD_B(0); LOAD_B(1); | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         MATMUL(1, 0); MATMUL(1, 1); | |||
|         ptr_a0 += 8; ptr_a1 += 8; | |||
|         ptr_b0 += 8; ptr_b1 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); | |||
|         } | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         MATMUL(1, 0); MATMUL(1, 1); | |||
|       } | |||
|       STORE_C(0, 0); STORE_C(0, 1); | |||
|       STORE_C(1, 0); STORE_C(1, 1); | |||
|       ptr_c00 += 4; ptr_c01 += 4; | |||
|     } | |||
|     if (m & 2) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a += 2 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       ptr_b1 = ptr_b0 + 2 * k; | |||
|       LOAD_C(0, 0); LOAD_C(0, 1); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); | |||
|         LOAD_B(0); LOAD_B(1); | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         ptr_a0 += 8; | |||
|         ptr_b0 += 8; ptr_b1 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); | |||
|           LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); | |||
|           LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); | |||
|           LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); | |||
|         } | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|       } | |||
|       STORE_C(0, 0); STORE_C(0, 1); | |||
|       ptr_c00 += 2; ptr_c01 += 2; | |||
|     } | |||
|     if (m & 1) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_b0 = ptr_b; | |||
|       ptr_b1 = ptr_b0 + 2 * k; | |||
|       LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         // Single m index: only four bf16 values of A per step. | |||
|         ma0 = svld1_bf16(pg16_low, ptr_a0); | |||
|         LOAD_B(0); LOAD_B(1); | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|         ptr_a0 += 4; | |||
|         ptr_b0 += 8; ptr_b1 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1_LOW(a, 0); | |||
|           LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2_LOW(a, 0); | |||
|           LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3_LOW(a, 0); | |||
|           LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); | |||
|         } | |||
|         MATMUL(0, 0); MATMUL(0, 1); | |||
|       } | |||
|       STORE_C_LOW(0, 0); STORE_C_LOW(0, 1); | |||
|     } | |||
|     ptr_b += 4 * k; | |||
|   } | |||
|  | |||
|   // ---- two remaining n indices ---- | |||
|   if (n & 2) { | |||
|     ptr_c00 = ptr_c; | |||
|     ptr_c += 2 * ldc; | |||
|     ptr_a = (bfloat16_t *)A; | |||
|     for (BLASLONG i = 0; i < m / 8; i++) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a2 = ptr_a1 + 2 * k; | |||
|       ptr_a3 = ptr_a2 + 2 * k; | |||
|       ptr_a += 8 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C(0, 0); | |||
|       LOAD_C(1, 0); | |||
|       LOAD_C(2, 0); | |||
|       LOAD_C(3, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); | |||
|         LOAD_B(0); | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         MATMUL(2, 0); | |||
|         MATMUL(3, 0); | |||
|         ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; | |||
|         ptr_b0 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); | |||
|           LOAD_KREST_1(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); | |||
|           LOAD_KREST_2(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); | |||
|           LOAD_KREST_3(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         MATMUL(2, 0); | |||
|         MATMUL(3, 0); | |||
|       } | |||
|       STORE_C(0, 0); | |||
|       STORE_C(1, 0); | |||
|       STORE_C(2, 0); | |||
|       STORE_C(3, 0); | |||
|       ptr_c00 += 8; | |||
|     } | |||
|     if (m & 4) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a += 4 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C(0, 0); | |||
|       LOAD_C(1, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); | |||
|         LOAD_B(0); | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         ptr_a0 += 8; ptr_a1 += 8; | |||
|         ptr_b0 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|       } | |||
|       STORE_C(0, 0); | |||
|       STORE_C(1, 0); | |||
|       ptr_c00 += 4; | |||
|     } | |||
|     if (m & 2) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a += 2 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C(0, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); | |||
|         LOAD_B(0); | |||
|         MATMUL(0, 0); | |||
|         ptr_a0 += 8; | |||
|         ptr_b0 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); | |||
|           LOAD_KREST_1(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); | |||
|           LOAD_KREST_2(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); | |||
|           LOAD_KREST_3(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|       } | |||
|       STORE_C(0, 0); | |||
|       ptr_c00 += 2; | |||
|     } | |||
|     if (m & 1) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_b0 = ptr_b; | |||
|       // Only lanes 0 and 1 (one m index, two n indices) belong to C here. | |||
|       // A full-predicate gather would also read the elements at +1 and | |||
|       // ldc + 1, i.e. past the last m index and possibly past the end of C, | |||
|       // so load with the same _LOW predicate that the store uses. | |||
|       LOAD_C_LOW(0, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         ma0 = svld1_bf16(pg16_low, ptr_a0); | |||
|         LOAD_B(0); | |||
|         MATMUL(0, 0); | |||
|         ptr_a0 += 4; | |||
|         ptr_b0 += 8; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1_LOW(a, 0); | |||
|           LOAD_KREST_1(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2_LOW(a, 0); | |||
|           LOAD_KREST_2(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3_LOW(a, 0); | |||
|           LOAD_KREST_3(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|       } | |||
|       STORE_C_LOW(0, 0); | |||
|     } | |||
|     ptr_b += 2 * k; | |||
|   } | |||
|  | |||
|   // ---- last single n index ---- | |||
|   if (n & 1) { | |||
|     ptr_c00 = ptr_c; | |||
|     ptr_a = (bfloat16_t *) A; | |||
|     for (BLASLONG i = 0; i < m / 8; i++) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a2 = ptr_a1 + 2 * k; | |||
|       ptr_a3 = ptr_a2 + 2 * k; | |||
|       ptr_a += 8 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C_EVEN(0, 0); | |||
|       LOAD_C_EVEN(1, 0); | |||
|       LOAD_C_EVEN(2, 0); | |||
|       LOAD_C_EVEN(3, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); | |||
|         // Single n index: only four bf16 values of B per step. | |||
|         mb0 = svld1_bf16(pg16_low, ptr_b0); | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         MATMUL(2, 0); | |||
|         MATMUL(3, 0); | |||
|         ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; | |||
|         ptr_b0 += 4; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); | |||
|           LOAD_KREST_1_LOW(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); | |||
|           LOAD_KREST_2_LOW(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); | |||
|           LOAD_KREST_3_LOW(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         MATMUL(2, 0); | |||
|         MATMUL(3, 0); | |||
|       } | |||
|       STORE_C_EVEN(0, 0); | |||
|       STORE_C_EVEN(1, 0); | |||
|       STORE_C_EVEN(2, 0); | |||
|       STORE_C_EVEN(3, 0); | |||
|       ptr_c00 += 8; | |||
|     } | |||
|     if (m & 4) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a1 = ptr_a0 + 2 * k; | |||
|       ptr_a += 4 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C_EVEN(0, 0); | |||
|       LOAD_C_EVEN(1, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); LOAD_A(1); | |||
|         mb0 = svld1_bf16(pg16_low, ptr_b0); | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|         ptr_a0 += 8; ptr_a1 += 8; | |||
|         ptr_b0 += 4; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); | |||
|           LOAD_KREST_1_LOW(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); | |||
|           LOAD_KREST_2_LOW(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); | |||
|           LOAD_KREST_3_LOW(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|         MATMUL(1, 0); | |||
|       } | |||
|       STORE_C_EVEN(0, 0); | |||
|       STORE_C_EVEN(1, 0); | |||
|       ptr_c00 += 4; | |||
|     } | |||
|     if (m & 2) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_a += 2 * k; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C_EVEN(0, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         LOAD_A(0); | |||
|         mb0 = svld1_bf16(pg16_low, ptr_b0); | |||
|         MATMUL(0, 0); | |||
|         ptr_a0 += 8; | |||
|         ptr_b0 += 4; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1(a, 0); | |||
|           LOAD_KREST_1_LOW(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2(a, 0); | |||
|           LOAD_KREST_2_LOW(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3(a, 0); | |||
|           LOAD_KREST_3_LOW(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|       } | |||
|       STORE_C_EVEN(0, 0); | |||
|       ptr_c00 += 2; | |||
|     } | |||
|     if (m & 1) { | |||
|       ptr_a0 = ptr_a; | |||
|       ptr_b0 = ptr_b; | |||
|       LOAD_C_FIRST(0, 0); | |||
|       for (BLASLONG p = 0; p < k / 4; p++) { | |||
|         ma0 = svld1_bf16(pg16_low, ptr_a0); | |||
|         mb0 = svld1_bf16(pg16_low, ptr_b0); | |||
|         MATMUL(0, 0); | |||
|         ptr_a0 += 4; | |||
|         ptr_b0 += 4; | |||
|       } | |||
|       if (krest) { | |||
|         if (krest == 1) { | |||
|           LOAD_KREST_1_LOW(a, 0); | |||
|           LOAD_KREST_1_LOW(b, 0); | |||
|         } else if (krest == 2) { | |||
|           LOAD_KREST_2_LOW(a, 0); | |||
|           LOAD_KREST_2_LOW(b, 0); | |||
|         } else if (krest == 3) { | |||
|           LOAD_KREST_3_LOW(a, 0); | |||
|           LOAD_KREST_3_LOW(b, 0); | |||
|         } | |||
|         MATMUL(0, 0); | |||
|       } | |||
|       STORE_C_FIRST(0, 0); | |||
|     } | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,101 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Packs `a` into `b`, reading contiguous runs from lines that are `lda` | |||
| * elements apart. | |||
| * | |||
| * Lines are taken two at a time (n / 2 pairs). For every pair the m values | |||
| * of each line are consumed in groups of four: four values of the first | |||
| * line are written, then the matching four of the second line. The m & 3 | |||
| * leftover values are written the same way (first line, then second line) | |||
| * without padding. If n is odd, the final line is copied on its own. | |||
| * | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
|   const BLASLONG quads = m / 4; | |||
|   const BLASLONG tail = m & 3; | |||
|   IFLOAT *src = a; | |||
|   IFLOAT *dst = b; | |||
|  | |||
|   for (BLASLONG pair = 0; pair < n / 2; pair++) { | |||
|     IFLOAT *line0 = src; | |||
|     IFLOAT *line1 = src + lda; | |||
|     src += 2 * lda; | |||
|  | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (BLASLONG t = 0; t < 4; t++) { | |||
|         dst[t] = line0[t]; | |||
|       } | |||
|       for (BLASLONG t = 0; t < 4; t++) { | |||
|         dst[4 + t] = line1[t]; | |||
|       } | |||
|       line0 += 4; | |||
|       line1 += 4; | |||
|       dst += 8; | |||
|     } | |||
|  | |||
|     // Leftover values: `tail` from the first line, then `tail` from the second. | |||
|     for (BLASLONG t = 0; t < tail; t++) { | |||
|       dst[t] = line0[t]; | |||
|     } | |||
|     for (BLASLONG t = 0; t < tail; t++) { | |||
|       dst[tail + t] = line1[t]; | |||
|     } | |||
|     dst += 2 * tail; | |||
|   } | |||
|  | |||
|   if (n & 1) { | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (BLASLONG t = 0; t < 4; t++) { | |||
|         dst[t] = src[t]; | |||
|       } | |||
|       dst += 4; | |||
|       src += 4; | |||
|     } | |||
|     for (BLASLONG t = 0; t < tail; t++) { | |||
|       dst[t] = src[t]; | |||
|     } | |||
|   } | |||
|  | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,109 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
| * Packs `a` into `b`, reading elements that are `lda` apart. | |||
| * | |||
| * Two adjacent offsets of `a` are handled per pass (n / 2 passes). Within a | |||
| * pass the m lda-strided elements are consumed in groups of four: the four | |||
| * elements at the first offset are written, then the four at the second | |||
| * offset. The m & 3 leftover elements are written the same way without | |||
| * padding. If n is odd, the final offset is packed on its own. | |||
| * | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
|   const BLASLONG quads = m / 4; | |||
|   const BLASLONG tail = m & 3; | |||
|   IFLOAT *src = a; | |||
|   IFLOAT *dst = b; | |||
|  | |||
|   for (BLASLONG pair = 0; pair < n / 2; pair++) { | |||
|     IFLOAT *base = src; | |||
|     src += 2; | |||
|  | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (BLASLONG c = 0; c < 2; c++) { | |||
|         for (BLASLONG r = 0; r < 4; r++) { | |||
|           dst[4 * c + r] = base[r * lda + c]; | |||
|         } | |||
|       } | |||
|       dst += 8; | |||
|       base += 4 * lda; | |||
|     } | |||
|  | |||
|     // Leftover elements: `tail` at the first offset, then `tail` at the second. | |||
|     for (BLASLONG c = 0; c < 2; c++) { | |||
|       for (BLASLONG r = 0; r < tail; r++) { | |||
|         dst[tail * c + r] = base[r * lda + c]; | |||
|       } | |||
|     } | |||
|     dst += 2 * tail; | |||
|   } | |||
|  | |||
|   if (n & 1) { | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (BLASLONG r = 0; r < 4; r++) { | |||
|         dst[r] = src[r * lda]; | |||
|       } | |||
|       dst += 4; | |||
|       src += 4 * lda; | |||
|     } | |||
|     for (BLASLONG r = 0; r < tail; r++) { | |||
|       dst[r] = src[r * lda]; | |||
|     } | |||
|   } | |||
|  | |||
|   return 0; | |||
| } | |||
| @@ -198,8 +198,8 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n | |||
| static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result) | |||
| { | |||
| FLOAT dotr = 0.0, doti = 0.0; | |||
| CREAL(*result) = 0.0; | |||
| CIMAG(*result) = 0.0; | |||
| OPENBLAS_COMPLEX_FLOAT cf = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); | |||
| *result = cf; | |||
| if ( n < 0 ) return; | |||
| @@ -290,8 +290,8 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON | |||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" | |||
| ); | |||
| CREAL(*result) = dotr; | |||
| CIMAG(*result) = doti; | |||
| cf=OPENBLAS_MAKE_COMPLEX_FLOAT(dotr, doti); | |||
| *result = cf; | |||
| return; | |||
| } | |||
| @@ -312,9 +312,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| int nthreads; | |||
| FLOAT dummy_alpha; | |||
| #endif | |||
| OPENBLAS_COMPLEX_FLOAT zdot; | |||
| CREAL(zdot) = 0.0; | |||
| CIMAG(zdot) = 0.0; | |||
| OPENBLAS_COMPLEX_FLOAT zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| @@ -341,8 +339,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| ptr = (OPENBLAS_COMPLEX_FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); | |||
| CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); | |||
| zdot = OPENBLAS_MAKE_COMPLEX_FLOAT (CREAL(zdot) + CREAL(*ptr), CIMAG(zdot) + CIMAG(*ptr)); | |||
| ptr = (void *)(((char *)ptr) + sizeof(double) * 2); | |||
| } | |||
| } | |||
| @@ -108,10 +108,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifndef DGEMMKERNEL | |||
| @@ -120,10 +120,10 @@ DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifndef CGEMMKERNEL | |||
| @@ -132,10 +132,10 @@ CGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifndef ZGEMMKERNEL | |||
| @@ -144,10 +144,10 @@ ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifndef SGEMM_BETA | |||
| @@ -3,10 +3,10 @@ DGEMMINCOPY = dgemm_ncopy_16.S | |||
| DGEMMITCOPY = dgemm_tcopy_16.S | |||
| DGEMMONCOPY = dgemm_ncopy_4.S | |||
| DGEMMOTCOPY = dgemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -11,26 +11,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -53,6 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define s4 $f9 | |||
| #define ALPHA $f4 | |||
| #define max $f5 | |||
| #define INF $f6 | |||
| PROLOGUE | |||
| @@ -61,6 +62,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| // Init INF | |||
| addi.d TEMP, $r0, 0x7FF | |||
| slli.d TEMP, TEMP, 52 | |||
| MTC INF, TEMP | |||
| MTC s1, $r0 | |||
| bge $r0, N, .L999 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| @@ -198,7 +204,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| CMPEQ $fcc0, s1, a1 | |||
| fcvt.d.s ALPHA, ALPHA | |||
| bcnez $fcc0, .L999 | |||
| fdiv.d ALPHA, ALPHA, s1 | |||
| CMPEQ $fcc0, INF, ALPHA | |||
| bcnez $fcc0, .L999 | |||
| MOV max, s1 | |||
| MOV s1, a1 | |||
| MOV s2, a1 | |||
| @@ -68,6 +68,7 @@ | |||
| #define ALPHA $f16 | |||
| #define max $f17 | |||
| #define INF $f18 | |||
| PROLOGUE | |||
| @@ -86,6 +87,11 @@ | |||
| move XX, X | |||
| NOP | |||
| //Init INF | |||
| lui TEMP, 0x7FF0 | |||
| dsll TEMP, TEMP, 32 | |||
| MTC1 TEMP, INF | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu N, N, -1 | |||
| @@ -255,6 +261,9 @@ | |||
| div.d ALPHA, ALPHA, s1 | |||
| MOV max, s1 | |||
| CMPEQ $fcc0, ALPHA, INF | |||
| bc1t $fcc0, .L999 | |||
| MOV s1, a1 | |||
| MOV s2, a1 | |||
| MOV s3, a1 | |||
| @@ -1,152 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL 1 | |||
/*
 * copy_kernel - POWER10 inline-assembly copy of n elements from x to y.
 *
 * n  element count; it is decremented by 32 for every 256 bytes moved, so
 *    one element is 8 bytes here (presumably single-precision complex --
 *    TODO confirm against the including file's FLOAT).
 * x  source pointer, advanced inside the asm ("+b" operand %2).
 * y  destination pointer, advanced inside the asm ("+b" operand %3).
 *
 * The first 256 bytes of x are loaded before n is tested, and the tail
 * always stores 256 bytes, so the caller must pass n >= 32; n is presumably
 * a multiple of 32 as well -- verify against the caller.
 *
 * Data moves through vs32..vs47. Each lxvp names an even register and the
 * clobber list covers the odd partner too, i.e. one lxvp fills a register
 * pair (32 bytes). Stores are done with single-register stxv; in the
 * non-big-endian branch the odd register goes to the lower address.
 */
| static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
/* Prologue: preload the first 256-byte chunk of x into vs32..vs47. */
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
/* x now points at the next chunk; n -= 32 sets cr0; skip the loop when
   nothing is left beyond the chunk already held in registers. */
| "addi %2, %2, 256 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Loop body, first half: store vs32..vs39 to y[0..127 bytes] ... */
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| #endif | |||
/* ... then immediately refill the same registers from the next chunk. */
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
/* Second half: same store-then-reload pattern for vs40..vs47. */
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
/* Advance both pointers one chunk, n -= 32, repeat while n > 0. */
| "addi %3, %3, 256 \n\t" | |||
| "addi %2, %2, 256 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Epilogue: flush the last chunk still held in vs32..vs47 (no reload). */
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
/* Assembler-comment line only: records the operand mapping in the listing. */
| "#n=%1 x=%4=%2 y=%0=%3" | |||
| : | |||
/* NOTE(review): the memory operands name only *y and *x, while the asm
   reads and writes whole 256-byte chunks -- confirm this is sufficient
   for the compilers in use. */
| "=m" (*y), | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" | |||
| ); | |||
| } | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "ccopy_microk_power10.c" | |||
| #include "copy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL | |||
| @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| BLASLONG n1 = n & -64; | |||
| if ( n1 > 0 ) | |||
| { | |||
| copy_kernel(n1, x, y); | |||
| @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER10) | |||
| #pragma GCC optimize "O1" | |||
| #include "cdot_microk_power10.c" | |||
| #else | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -61,37 +61,97 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "stxvp 32, 0(%3) \n\t" | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| #endif | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "stxvp 48, 256(%3) \n\t" | |||
| "stxvp 50, 288(%3) \n\t" | |||
| "stxvp 52, 320(%3) \n\t" | |||
| "stxvp 54, 352(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 256(%3) \n\t" | |||
| "stxv 49, 272(%3) \n\t" | |||
| "stxv 50, 288(%3) \n\t" | |||
| "stxv 51, 304(%3) \n\t" | |||
| "stxv 52, 320(%3) \n\t" | |||
| "stxv 53, 336(%3) \n\t" | |||
| "stxv 54, 352(%3) \n\t" | |||
| "stxv 55, 368(%3) \n\t" | |||
| #else | |||
| "stxv 49, 256(%3) \n\t" | |||
| "stxv 48, 272(%3) \n\t" | |||
| "stxv 51, 288(%3) \n\t" | |||
| "stxv 50, 304(%3) \n\t" | |||
| "stxv 53, 320(%3) \n\t" | |||
| "stxv 52, 336(%3) \n\t" | |||
| "stxv 55, 352(%3) \n\t" | |||
| "stxv 54, 368(%3) \n\t" | |||
| #endif | |||
| "lxvp 48, 256(%2) \n\t" | |||
| "lxvp 50, 288(%2) \n\t" | |||
| "lxvp 52, 320(%2) \n\t" | |||
| "lxvp 54, 352(%2) \n\t" | |||
| "stxvp 56, 384(%3) \n\t" | |||
| "stxvp 58, 416(%3) \n\t" | |||
| "stxvp 60, 448(%3) \n\t" | |||
| "stxvp 62, 480(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 56, 384(%3) \n\t" | |||
| "stxv 57, 400(%3) \n\t" | |||
| "stxv 58, 416(%3) \n\t" | |||
| "stxv 59, 432(%3) \n\t" | |||
| "stxv 60, 448(%3) \n\t" | |||
| "stxv 61, 464(%3) \n\t" | |||
| "stxv 62, 480(%3) \n\t" | |||
| "stxv 63, 496(%3) \n\t" | |||
| #else | |||
| "stxv 57, 384(%3) \n\t" | |||
| "stxv 56, 400(%3) \n\t" | |||
| "stxv 59, 416(%3) \n\t" | |||
| "stxv 58, 432(%3) \n\t" | |||
| "stxv 61, 448(%3) \n\t" | |||
| "stxv 60, 464(%3) \n\t" | |||
| "stxv 63, 480(%3) \n\t" | |||
| "stxv 62, 496(%3) \n\t" | |||
| #endif | |||
| "lxvp 56, 384(%2) \n\t" | |||
| "lxvp 58, 416(%2) \n\t" | |||
| "lxvp 60, 448(%2) \n\t" | |||
| @@ -111,22 +171,73 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "two%=: \n\t" | |||
| "stxvp 32, 0(%3) \n\t" | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| "stxvp 48, 256(%3) \n\t" | |||
| "stxvp 50, 288(%3) \n\t" | |||
| "stxvp 52, 320(%3) \n\t" | |||
| "stxvp 54, 352(%3) \n\t" | |||
| "stxvp 56, 384(%3) \n\t" | |||
| "stxvp 58, 416(%3) \n\t" | |||
| "stxvp 60, 448(%3) \n\t" | |||
| "stxvp 62, 480(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| "stxv 48, 256(%3) \n\t" | |||
| "stxv 49, 272(%3) \n\t" | |||
| "stxv 50, 288(%3) \n\t" | |||
| "stxv 51, 304(%3) \n\t" | |||
| "stxv 52, 320(%3) \n\t" | |||
| "stxv 53, 336(%3) \n\t" | |||
| "stxv 54, 352(%3) \n\t" | |||
| "stxv 55, 368(%3) \n\t" | |||
| "stxv 56, 384(%3) \n\t" | |||
| "stxv 57, 400(%3) \n\t" | |||
| "stxv 58, 416(%3) \n\t" | |||
| "stxv 59, 432(%3) \n\t" | |||
| "stxv 60, 448(%3) \n\t" | |||
| "stxv 61, 464(%3) \n\t" | |||
| "stxv 62, 480(%3) \n\t" | |||
| "stxv 63, 496(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| "stxv 49, 256(%3) \n\t" | |||
| "stxv 48, 272(%3) \n\t" | |||
| "stxv 51, 288(%3) \n\t" | |||
| "stxv 50, 304(%3) \n\t" | |||
| "stxv 53, 320(%3) \n\t" | |||
| "stxv 52, 336(%3) \n\t" | |||
| "stxv 55, 352(%3) \n\t" | |||
| "stxv 54, 368(%3) \n\t" | |||
| "stxv 57, 384(%3) \n\t" | |||
| "stxv 56, 400(%3) \n\t" | |||
| "stxv 59, 416(%3) \n\t" | |||
| "stxv 58, 432(%3) \n\t" | |||
| "stxv 61, 448(%3) \n\t" | |||
| "stxv 60, 464(%3) \n\t" | |||
| "stxv 63, 480(%3) \n\t" | |||
| "stxv 62, 496(%3) \n\t" | |||
| #endif | |||
| "#n=%1 x=%4=%2 y=%0=%3" | |||
| : | |||
| @@ -95,18 +95,38 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
| "xvaddsp 50, 50, 36 \n\t" | |||
| "xvaddsp 51, 51, 37 \n\t" | |||
| "stxvp 48, 0(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%2) \n\t" | |||
| "stxv 49, 16(%2) \n\t" | |||
| #else | |||
| "stxv 49, 0(%2) \n\t" | |||
| "stxv 48, 16(%2) \n\t" | |||
| #endif | |||
| "xvaddsp 52, 52, 38 \n\t" | |||
| "xvaddsp 53, 53, 39 \n\t" | |||
| "stxvp 50, 32(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 50, 32(%2) \n\t" | |||
| "stxv 51, 48(%2) \n\t" | |||
| #else | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| #endif | |||
| "xvaddsp 54, 54, 56 \n\t" | |||
| "xvaddsp 55, 55, 57 \n\t" | |||
| "stxvp 52, 64(%2) \n\t" | |||
| "stxvp 54, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 52, 64(%2) \n\t" | |||
| "stxv 53, 80(%2) \n\t" | |||
| "stxv 54, 96(%2) \n\t" | |||
| "stxv 55, 112(%2) \n\t" | |||
| #else | |||
| "stxv 53, 64(%2) \n\t" | |||
| "stxv 52, 80(%2) \n\t" | |||
| "stxv 55, 96(%2) \n\t" | |||
| "stxv 54, 112(%2) \n\t" | |||
| #endif | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -148,18 +168,39 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
| "xvaddsp 50, 50, 36 \n\t" | |||
| "xvaddsp 51, 51, 37 \n\t" | |||
| "stxvp 48, 0(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%2) \n\t" | |||
| "stxv 49, 16(%2) \n\t" | |||
| #else | |||
| "stxv 49, 0(%2) \n\t" | |||
| "stxv 48, 16(%2) \n\t" | |||
| #endif | |||
| "xvaddsp 52, 52, 38 \n\t" | |||
| "xvaddsp 53, 53, 39 \n\t" | |||
| "stxvp 50, 32(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 50, 32(%2) \n\t" | |||
| "stxv 51, 48(%2) \n\t" | |||
| #else | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| #endif | |||
| "xvaddsp 54, 54, 56 \n\t" | |||
| "xvaddsp 55, 55, 57 \n\t" | |||
| "stxvp 52, 64(%2) \n\t" | |||
| "stxvp 54, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 52, 64(%2) \n\t" | |||
| "stxv 53, 80(%2) \n\t" | |||
| "stxv 54, 96(%2) \n\t" | |||
| "stxv 55, 112(%2) \n\t" | |||
| #else | |||
| "stxv 53, 64(%2) \n\t" | |||
| "stxv 52, 80(%2) \n\t" | |||
| "stxv 55, 96(%2) \n\t" | |||
| "stxv 54, 112(%2) \n\t" | |||
| #endif | |||
| "#n=%1 x=%0=%2 alpha=(%3,%4)\n" | |||
| : | |||
| @@ -60,14 +60,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) | |||
| "xvmaddadp 37, 33, %x4 \n\t" | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "stxvp 36, 0(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 36, 0(%3) \n\t" | |||
| "stxv 37, 16(%3) \n\t" | |||
| #else | |||
| "stxv 37, 0(%3) \n\t" | |||
| "stxv 36, 16(%3) \n\t" | |||
| #endif | |||
| "xvmaddadp 38, 34, %x4 \n\t" | |||
| "xvmaddadp 39, 35, %x4 \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "stxvp 38, 32(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 38, 32(%3) \n\t" | |||
| "stxv 39, 48(%3) \n\t" | |||
| #else | |||
| "stxv 39, 32(%3) \n\t" | |||
| "stxv 38, 48(%3) \n\t" | |||
| #endif | |||
| "lxvp 36, 128(%3) \n\t" | |||
| "lxvp 38, 160(%3) \n\t" | |||
| @@ -76,13 +87,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) | |||
| "xvmaddadp 45, 41, %x4 \n\t" | |||
| "lxvp 40, 64(%2) \n\t" | |||
| "stxvp 44, 64(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 44, 64(%3) \n\t" | |||
| "stxv 45, 80(%3) \n\t" | |||
| #else | |||
| "stxv 45, 64(%3) \n\t" | |||
| "stxv 44, 80(%3) \n\t" | |||
| #endif | |||
| "xvmaddadp 46, 42, %x4 \n\t" | |||
| "xvmaddadp 47, 43, %x4 \n\t" | |||
| "lxvp 42, 96(%2) \n\t" | |||
| "stxvp 46, 96(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 46, 96(%3) \n\t" | |||
| "stxv 47, 112(%3) \n\t" | |||
| #else | |||
| "stxv 47, 96(%3) \n\t" | |||
| "stxv 46, 112(%3) \n\t" | |||
| #endif | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| @@ -105,10 +128,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) | |||
| "xvmaddadp 46, 42, %x4 \n\t" | |||
| "xvmaddadp 47, 43, %x4 \n\t" | |||
| "stxvp 36, 0(%3) \n\t" | |||
| "stxvp 38, 32(%3) \n\t" | |||
| "stxvp 44, 64(%3) \n\t" | |||
| "stxvp 46, 96(%3) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 36, 0(%3) \n\t" | |||
| "stxv 37, 16(%3) \n\t" | |||
| "stxv 38, 32(%3) \n\t" | |||
| "stxv 39, 48(%3) \n\t" | |||
| "stxv 44, 64(%3) \n\t" | |||
| "stxv 45, 80(%3) \n\t" | |||
| "stxv 46, 96(%3) \n\t" | |||
| "stxv 47, 112(%3) \n\t" | |||
| #else | |||
| "stxv 37, 0(%3) \n\t" | |||
| "stxv 36, 16(%3) \n\t" | |||
| "stxv 39, 32(%3) \n\t" | |||
| "stxv 38, 48(%3) \n\t" | |||
| "stxv 45, 64(%3) \n\t" | |||
| "stxv 44, 80(%3) \n\t" | |||
| "stxv 47, 96(%3) \n\t" | |||
| "stxv 46, 112(%3) \n\t" | |||
| #endif | |||
| "#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n" | |||
| : | |||
| @@ -68,7 +68,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| y[i] += da * x[i] ; | |||
| } | |||
| @@ -87,7 +87,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if ( n >= 64 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| y[i] = x[i] ; | |||
| } | |||
| @@ -35,327 +35,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HAVE_KERNEL4x8_ASM) | |||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| BLASLONG off2; | |||
| BLASLONG tempR; | |||
| __asm__( | |||
| "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 | |||
| "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,34,34 \n\t" | |||
| "add %[a2], %[a0], %[temp] \n\t" | |||
| "add %[a1], %[a0], %[off] \n\t" | |||
| "xxlxor 4,34,34 \n\t" | |||
| "xxlxor 5,34,34 \n\t" | |||
| "xxlxor 6,34,34 \n\t" | |||
| "xxlxor 7,34,34 \n\t" | |||
| "add %[a3], %[a2], %[off] \n\t" | |||
| "add %[a4], %[a2], %[temp] \n\t" | |||
| "xxlxor 8,34,34 \n\t" | |||
| "xxlxor 9,34,34 \n\t" | |||
| "add %[a5], %[a3], %[temp] \n\t" | |||
| "li %[off],0 \n\t" | |||
| "li %[off2],16 \n\t" | |||
| "add %[a6], %[a4], %[temp] \n\t" | |||
| "add %[a7], %[a5], %[temp] \n\t" | |||
| "lxvp 32, 0(%[x]) \n\t" | |||
| "lxvp 36, 0(%[a0]) \n\t" | |||
| "lxvp 38, 0(%[a1]) \n\t" | |||
| "lxvp 40, 0(%[a2]) \n\t" | |||
| "lxvp 42, 0(%[a3]) \n\t" | |||
| "lxvp 44, 0(%[a4]) \n\t" | |||
| "lxvp 46, 0(%[a5]) \n\t" | |||
| "lxvp 48, 0(%[a6]) \n\t" | |||
| "lxvp 50, 0(%[a7]) \n\t" | |||
| #if defined(PREFETCH) | |||
| "li %[temp],896 \n\t" | |||
| #endif | |||
| "addic. %[n],%[n],-4 \n\t" | |||
| "li %[off],32 \n\t" | |||
| "ble- two%= \n\t" | |||
| //-------------------------------------------------- | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmaddadp 34,36,32 \n\t" | |||
| "xvmaddadp 35,38,32 \n\t" | |||
| "addi %[off2], %[off2],32 \n\t" | |||
| "xvmaddadp 4,40,32 \n\t" | |||
| "xvmaddadp 5,42,32 \n\t" | |||
| "xvmaddadp 6,44,32 \n\t" | |||
| "xvmaddadp 7,46,32 \n\t" | |||
| "xvmaddadp 8,48,32 \n\t" | |||
| "xvmaddadp 9,50,32 \n\t" | |||
| "xvmaddadp 34,37,33 \n\t" | |||
| "xvmaddadp 35,39,33 \n\t" | |||
| "lxvp 36, 32(%[a0]) \n\t" | |||
| "lxvp 38, 32(%[a1]) \n\t" | |||
| "xvmaddadp 4,41,33 \n\t" | |||
| "xvmaddadp 5,43,33 \n\t" | |||
| "addi %[off], %[off],32 \n\t" | |||
| "lxvp 40, 32(%[a2]) \n\t" | |||
| "lxvp 42, 32(%[a3]) \n\t" | |||
| "xvmaddadp 6,45,33 \n\t" | |||
| "xvmaddadp 7,47,33 \n\t" | |||
| "lxvp 44, 32(%[a4]) \n\t" | |||
| "lxvp 46, 32(%[a5]) \n\t" | |||
| "xvmaddadp 8,49,33 \n\t" | |||
| "xvmaddadp 9,51,33 \n\t" | |||
| "addic. %[n],%[n],-4 \n\t" | |||
| "lxvp 48, 32(%[a6]) \n\t" | |||
| "lxvp 50, 32(%[a7]) \n\t" | |||
| "lxvp 32, 32(%[x]) \n\t" | |||
| "ble- two%= \n\t" | |||
| "xvmaddadp 34,36,32 \n\t" | |||
| "xvmaddadp 35,38,32 \n\t" | |||
| "addi %[off2], %[off2],32 \n\t" | |||
| "xvmaddadp 4,40,32 \n\t" | |||
| "xvmaddadp 5,42,32 \n\t" | |||
| "xvmaddadp 6,44,32 \n\t" | |||
| "xvmaddadp 7,46,32 \n\t" | |||
| "xvmaddadp 8,48,32 \n\t" | |||
| "xvmaddadp 9,50,32 \n\t" | |||
| "xvmaddadp 34,37,33 \n\t" | |||
| "xvmaddadp 35,39,33 \n\t" | |||
| "lxvp 36, 64(%[a0]) \n\t" | |||
| "lxvp 38, 64(%[a1]) \n\t" | |||
| "xvmaddadp 4,41,33 \n\t" | |||
| "xvmaddadp 5,43,33 \n\t" | |||
| "addi %[off], %[off],32 \n\t" | |||
| "lxvp 40, 64(%[a2]) \n\t" | |||
| "lxvp 42, 64(%[a3]) \n\t" | |||
| "xvmaddadp 6,45,33 \n\t" | |||
| "xvmaddadp 7,47,33 \n\t" | |||
| "lxvp 44, 64(%[a4]) \n\t" | |||
| "lxvp 46, 64(%[a5]) \n\t" | |||
| "xvmaddadp 8,49,33 \n\t" | |||
| "xvmaddadp 9,51,33 \n\t" | |||
| "addic. %[n],%[n],-4 \n\t" | |||
| "lxvp 48, 64(%[a6]) \n\t" | |||
| "lxvp 50, 64(%[a7]) \n\t" | |||
| "lxvp 32, 64(%[x]) \n\t" | |||
| "ble- two%= \n\t" | |||
| "xvmaddadp 34,36,32 \n\t" | |||
| "xvmaddadp 35,38,32 \n\t" | |||
| #if defined(PREFETCH) | |||
| "addi %[temp],%[temp],128 \n\t" | |||
| #endif | |||
| "addi %[off2], %[off2],32 \n\t" | |||
| "xvmaddadp 4,40,32 \n\t" | |||
| "xvmaddadp 5,42,32 \n\t" | |||
| "xvmaddadp 6,44,32 \n\t" | |||
| "xvmaddadp 7,46,32 \n\t" | |||
| "xvmaddadp 8,48,32 \n\t" | |||
| "xvmaddadp 9,50,32 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a0] \n\t" | |||
| #endif | |||
| "xvmaddadp 34,37,33 \n\t" | |||
| "xvmaddadp 35,39,33 \n\t" | |||
| "lxvp 36, 96(%[a0]) \n\t" | |||
| "lxvp 38, 96(%[a1]) \n\t" | |||
| "xvmaddadp 4,41,33 \n\t" | |||
| "xvmaddadp 5,43,33 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a1] \n\t" | |||
| #endif | |||
| "lxvp 40, 96(%[a2]) \n\t" | |||
| "lxvp 42, 96(%[a3]) \n\t" | |||
| "addi %[off], %[off],32 \n\t" | |||
| "xvmaddadp 6,45,33 \n\t" | |||
| "xvmaddadp 7,47,33 \n\t" | |||
| "lxvp 44, 96(%[a4]) \n\t" | |||
| "lxvp 46, 96(%[a5]) \n\t" | |||
| "xvmaddadp 8,49,33 \n\t" | |||
| "xvmaddadp 9,51,33 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a3] \n\t" | |||
| #endif | |||
| "lxvp 48, 96(%[a6]) \n\t" | |||
| "lxvp 50, 96(%[a7]) \n\t" | |||
| "lxvp 32, 96(%[x]) \n\t" | |||
| "addic. %[n],%[n],-4 \n\t" | |||
| "ble- two%= \n\t" | |||
| "addi %[off2], %[off2],32 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a2] \n\t" | |||
| #endif | |||
| "xvmaddadp 34,36,32 \n\t" | |||
| "xvmaddadp 35,38,32 \n\t" | |||
| "xvmaddadp 4,40,32 \n\t" | |||
| "xvmaddadp 5,42,32 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a4] \n\t" | |||
| #endif | |||
| "xvmaddadp 6,44,32 \n\t" | |||
| "xvmaddadp 7,46,32 \n\t" | |||
| "xvmaddadp 8,48,32 \n\t" | |||
| "xvmaddadp 9,50,32 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a5] \n\t" | |||
| #endif | |||
| "xvmaddadp 34,37,33 \n\t" | |||
| "xvmaddadp 35,39,33 \n\t" | |||
| "lxvp 36, 128(%[a0]) \n\t" | |||
| "lxvp 38, 128(%[a1]) \n\t" | |||
| "xvmaddadp 4,41,33 \n\t" | |||
| "xvmaddadp 5,43,33 \n\t" | |||
| "addi %[off], %[off],32 \n\t" | |||
| "lxvp 40, 128(%[a2]) \n\t" | |||
| "lxvp 42, 128(%[a3]) \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a6] \n\t" | |||
| #endif | |||
| "xvmaddadp 6,45,33 \n\t" | |||
| "xvmaddadp 7,47,33 \n\t" | |||
| "lxvp 44, 128(%[a4]) \n\t" | |||
| "lxvp 46, 128(%[a5]) \n\t" | |||
| "xvmaddadp 8,49,33 \n\t" | |||
| "xvmaddadp 9,51,33 \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[a7] \n\t" | |||
| #endif | |||
| "addic. %[n],%[n],-4 \n\t" | |||
| "lxvp 48, 128(%[a6]) \n\t" | |||
| "lxvp 50, 128(%[a7]) \n\t" | |||
| "lxvp 32, 128(%[x]) \n\t" | |||
| #if defined(PREFETCH) | |||
| "dcbt %[temp],%[x] \n\t" | |||
| #endif | |||
| "addi %[a0], %[a0], 128 \n\t" | |||
| "addi %[a1], %[a1], 128 \n\t" | |||
| "addi %[a2], %[a2], 128 \n\t" | |||
| "addi %[a3], %[a3], 128 \n\t" | |||
| "addi %[a4], %[a4], 128 \n\t" | |||
| "addi %[a5], %[a5], 128 \n\t" | |||
| "addi %[a6], %[a6], 128 \n\t" | |||
| "addi %[a7], %[a7], 128 \n\t" | |||
| "addi %[x], %[x], 128 \n\t" | |||
| "bgt+ one%= \n\t" | |||
| ".align 5 \n\t" | |||
| "two%=: \n\t" | |||
| //-------------------------------------------- | |||
| "xvmaddadp 34,36,32 \n\t" | |||
| "xvmaddadp 35,38,32 \n\t" | |||
| "xvmaddadp 4,40,32 \n\t" | |||
| "xvmaddadp 5,42,32 \n\t" | |||
| "xvmaddadp 6,44,32 \n\t" | |||
| "xvmaddadp 7,46,32 \n\t" | |||
| "xvmaddadp 8,48,32 \n\t" | |||
| "xvmaddadp 9,50,32 \n\t" | |||
| XXSPLTD_S(36,%x[alpha],0) | |||
| "xvmaddadp 34,37,33 \n\t" | |||
| "xvmaddadp 35,39,33 \n\t" | |||
| "xvmaddadp 4,41,33 \n\t" | |||
| "xvmaddadp 5,43,33 \n\t" | |||
| "xvmaddadp 6,45,33 \n\t" | |||
| "xvmaddadp 7,47,33 \n\t" | |||
| "xvmaddadp 8,49,33 \n\t" | |||
| "xvmaddadp 9,51,33 \n\t" | |||
| "lxvp 38, 0(%[y]) \n\t" | |||
| "lxvp 40, 32(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(42,34,35) | |||
| XXMRGLD_S(43,34,35) | |||
| XXMRGHD_S(44,4,5) | |||
| XXMRGLD_S(45,4,5) | |||
| #else | |||
| XXMRGLD_S(42,35,34) | |||
| XXMRGHD_S(43,35,34) | |||
| XXMRGLD_S(44,5,4) | |||
| XXMRGHD_S(45,5,4) | |||
| #endif | |||
| "xvadddp 42,42,43 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(46,6,7) | |||
| XXMRGLD_S(47,6,7) | |||
| #else | |||
| XXMRGLD_S(46,7,6) | |||
| XXMRGHD_S(47,7,6) | |||
| #endif | |||
| "xvadddp 44,44,45 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(48,8,9) | |||
| XXMRGLD_S(49,8,9) | |||
| #else | |||
| XXMRGLD_S(48,9,8) | |||
| XXMRGHD_S(49,9,8) | |||
| #endif | |||
| "xvadddp 46,46,47 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 38,42,36 \n\t" | |||
| "xvmaddadp 39,44,36 \n\t" | |||
| #else | |||
| "xvmaddadp 39,42,36 \n\t" | |||
| "xvmaddadp 38,44,36 \n\t" | |||
| #endif | |||
| "xvadddp 48,48,49 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 41,48,36 \n\t" | |||
| #else | |||
| "xvmaddadp 41,46,36 \n\t" | |||
| #if !__has_builtin(__builtin_vsx_disassemble_pair) | |||
| #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair | |||
| #endif | |||
| "stxvp 38, 0(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 40,46,36 \n\t" | |||
| #else | |||
| "xvmaddadp 40,48,36 \n\t" | |||
| #endif | |||
| "stxvp 40, 32(%[y]) \n\t" | |||
| : [memy] "+m" (*(double (*)[8])y), | |||
| [n] "+&r" (n), | |||
| [a0] "=b" (a0), | |||
| [a1] "=&b" (a1), | |||
| [a2] "=&b" (a2), | |||
| [a3] "=&b" (a3), | |||
| [a4] "=&b" (a4), | |||
| [a5] "=&b" (a5), | |||
| [a6] "=&b" (a6), | |||
| [a7] "=&b" (a7), | |||
| [off] "+&b" (lda), | |||
| [off2]"=&b" (off2), | |||
| [temp] "=&b" (tempR) | |||
| : [memx] "m" (*(const double (*)[n])x), | |||
| [mem_ap] "m" (*(const double (*)[n*8]) ap), | |||
| [alpha] "d" (alpha), | |||
| "[a0]" (ap), | |||
| [x] "b" (x), | |||
| [y] "b" (y) | |||
| : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", | |||
| "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" | |||
| ); | |||
| return; | |||
| typedef __vector unsigned char vec_t; | |||
/*
 * dgemv_kernel_4x8 - accumulate eight dot products against x into y[0..7].
 *
 * n      number of elements taken from x and from each of a0..a7; the loop
 *        consumes four elements per iteration, so n is presumably a
 *        multiple of 4 (a remainder is not handled here -- verify caller).
 * lda    distance, in elements, between consecutive vectors a0..a7.
 * ap     start of the first vector (a0); a1..a7 follow at multiples of lda.
 * x      vector multiplied element-wise with each of a0..a7.
 * y      eight results are updated in place: y[k] += alpha * dot(ak, x).
 * alpha  scale applied to every dot product before it is added to y.
 *
 * FLOAT is cast to __vector double throughout, so it is presumably double
 * in this translation unit -- TODO confirm.
 */
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector_pair vx, vp; | |||
| vec_t res[2],res1[2]; | |||
/* One two-lane accumulator per vector a0..a7; lanes are summed at the end. */
| register __vector double temp0 = {0, 0}; | |||
| register __vector double temp1 = {0, 0}; | |||
| register __vector double temp2 = {0, 0}; | |||
| register __vector double temp3 = {0, 0}; | |||
| register __vector double temp4 = {0, 0}; | |||
| register __vector double temp5 = {0, 0}; | |||
| register __vector double temp6 = {0, 0}; | |||
| register __vector double temp7 = {0, 0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
/* i advances by 2 while the element index is i*2, i.e. element offsets
   0, 4, 8, ...: each pass reads one __vector_pair (two vectors of two
   doubles) from x and from each of a0..a7. */
| for (i = 0; i < n/2; i += 2) { | |||
| vp = *((__vector_pair *)((void *)&a0[i*2])); | |||
| vx = *((__vector_pair *)((void *)&x[i*2])); | |||
/* Unpack both pairs the same way so res[k] lines up with res1[k];
   res (the x values) is reused for all eight vectors below. */
| __builtin_vsx_disassemble_pair (res, &vx); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0); | |||
| temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0); | |||
| vp = *((__vector_pair *)((void *)&a1[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1); | |||
| temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1); | |||
| vp = *((__vector_pair *)((void *)&a2[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2); | |||
| temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2); | |||
| vp = *((__vector_pair *)((void *)&a3[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3); | |||
| temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3); | |||
| vp = *((__vector_pair *)((void *)&a4[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4); | |||
| temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4); | |||
| vp = *((__vector_pair *)((void *)&a5[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5); | |||
| temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5); | |||
| vp = *((__vector_pair *)((void *)&a6[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6); | |||
| temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6); | |||
| vp = *((__vector_pair *)((void *)&a7[i*2])); | |||
| __builtin_vsx_disassemble_pair (res1, &vp); | |||
| temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7); | |||
| temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7); | |||
| } | |||
/* Horizontal reduction: add the two lanes of each accumulator, scale by
   alpha, and accumulate into the caller's y. */
| y[0] += alpha * (temp0[0] + temp0[1]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]); | |||
| } | |||
| #else | |||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| @@ -59,10 +59,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha) | |||
| "lxvp 36, 192(%2) \n\t" | |||
| "lxvp 38, 224(%2) \n\t" | |||
| "stxvp 40, 0(%2) \n\t" | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 0(%2) \n\t" | |||
| "stxv 41, 16(%2) \n\t" | |||
| "stxv 42, 32(%2) \n\t" | |||
| "stxv 43, 48(%2) \n\t" | |||
| "stxv 44, 64(%2) \n\t" | |||
| "stxv 45, 80(%2) \n\t" | |||
| "stxv 46, 96(%2) \n\t" | |||
| "stxv 47, 112(%2) \n\t" | |||
| #else | |||
| "stxv 41, 0(%2) \n\t" | |||
| "stxv 40, 16(%2) \n\t" | |||
| "stxv 43, 32(%2) \n\t" | |||
| "stxv 42, 48(%2) \n\t" | |||
| "stxv 45, 64(%2) \n\t" | |||
| "stxv 44, 80(%2) \n\t" | |||
| "stxv 47, 96(%2) \n\t" | |||
| "stxv 46, 112(%2) \n\t" | |||
| #endif | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -81,10 +96,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha) | |||
| "xvmuldp 46, 38, 48 \n\t" | |||
| "xvmuldp 47, 39, 48 \n\t" | |||
| "stxvp 40, 0(%2) \n\t" | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 0(%2) \n\t" | |||
| "stxv 41, 16(%2) \n\t" | |||
| "stxv 42, 32(%2) \n\t" | |||
| "stxv 43, 48(%2) \n\t" | |||
| "stxv 44, 64(%2) \n\t" | |||
| "stxv 45, 80(%2) \n\t" | |||
| "stxv 46, 96(%2) \n\t" | |||
| "stxv 47, 112(%2) \n\t" | |||
| #else | |||
| "stxv 41, 0(%2) \n\t" | |||
| "stxv 40, 16(%2) \n\t" | |||
| "stxv 43, 32(%2) \n\t" | |||
| "stxv 42, 48(%2) \n\t" | |||
| "stxv 45, 64(%2) \n\t" | |||
| "stxv 44, 80(%2) \n\t" | |||
| "stxv 47, 96(%2) \n\t" | |||
| "stxv 46, 112(%2) \n\t" | |||
| #endif | |||
| "#n=%1 alpha=%3 x=%0=%2" | |||
| : | |||
| @@ -112,10 +142,14 @@ static void dscal_kernel_8_zero (long n, double *x) | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "stxvp 32, 0(%2) \n\t" | |||
| "stxvp 32, 32(%2) \n\t" | |||
| "stxvp 32, 64(%2) \n\t" | |||
| "stxvp 32, 96(%2) \n\t" | |||
| "stxv 32, 0(%2) \n\t" | |||
| "stxv 32, 16(%2) \n\t" | |||
| "stxv 32, 32(%2) \n\t" | |||
| "stxv 32, 48(%2) \n\t" | |||
| "stxv 32, 64(%2) \n\t" | |||
| "stxv 32, 80(%2) \n\t" | |||
| "stxv 32, 96(%2) \n\t" | |||
| "stxv 32, 112(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -120,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| temp = y[i]; | |||
| y[i] = x[i]; | |||
| @@ -69,6 +69,7 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph | |||
| #endif | |||
| #ifdef SMP | |||
| // Multi-threading execution outperforms (or approaches) the execution of the | |||
| // small kernel. | |||
| if (num_cpu_avail(3) > 1) { | |||
| @@ -77,6 +78,9 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph | |||
| } else { | |||
| return 1; | |||
| } | |||
| #else | |||
| return 1; | |||
| #endif | |||
| #endif | |||
| @@ -131,6 +131,10 @@ | |||
| #define alpha f27 | |||
| #if defined(PPC440) | |||
| #define PREFETCHSIZE_A (3 * 4) | |||
| #endif | |||
| #if defined(PPCG4) | |||
| #define PREFETCHSIZE_A (3 * 4) | |||
| #endif | |||
| @@ -96,6 +96,11 @@ | |||
| #define X1 r22 | |||
| #if defined(PPC440) | |||
| #define PREFETCHSIZE_A 42 | |||
| #define PREFETCHSIZE_C 7 | |||
| #endif | |||
| #if defined(PPCG4) | |||
| #define PREFETCHSIZE_A 42 | |||
| #define PREFETCHSIZE_C 7 | |||