merge develop into 0.3.0 for 0.3.24 (tags/v0.3.24)
| @@ -0,0 +1,167 @@ | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - make TARGET=VORTEX USE_OPENMP=1 CC=clang | |||
| task: | |||
| name: AppleM1/LLVM/ILP64 | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 | |||
| task: | |||
| name: AppleM1/LLVM/CMAKE | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - mkdir build | |||
| - cd build | |||
| - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
| - make | |||
| task: | |||
| name: AppleM1/GCC/MAKE/OPENMP | |||
| compile_script: | |||
| - brew install gcc@11 | |||
| - export PATH=/opt/homebrew/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/include" | |||
| - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM x86_64 xbuild | |||
| compile_script: | |||
| - #brew install llvm | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export ARCHS="i386 x86_64" | |||
| - export ARCHS_STANDARD="i386 x86_64" | |||
| - export ARCHS_STANDARD_32_64_BIT="i386 x86_64" | |||
| - export ARCHS_STANDARD_64_BIT=x86_64 | |||
| - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64" | |||
| - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64" | |||
| - export VALID_ARCHS="i386 x86_64" | |||
| - xcrun --sdk macosx --show-sdk-path | |||
| - xcodebuild -version | |||
| - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64" | |||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| path: "*conf*" | |||
| type: text/plain | |||
| # lib_artifacts: | |||
| # path: "libopenblas*" | |||
| # type: application/octet-stream | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM armv8-ios xbuild | |||
| compile_script: | |||
| - #brew install llvm | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | |||
| always: | |||
| config_artifacts: | |||
| path: "*conf*" | |||
| type: text/plain | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM armv7-androidndk xbuild | |||
| compile_script: | |||
| - #brew install android-ndk | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib" | |||
| - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
| - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| path: "*conf*" | |||
| type: text/plain | |||
| task: | |||
| name: NeoverseN1 | |||
| arm_container: | |||
| image: node:latest | |||
| compile_script: | |||
| - make | |||
| task: | |||
| name: NeoverseN1-ILP64 | |||
| arm_container: | |||
| image: node:latest | |||
| compile_script: | |||
| - make INTERFACE64=1 | |||
| task: | |||
| name: NeoverseN1-OMP | |||
| arm_container: | |||
| image: node:latest | |||
| cpu: 8 | |||
| compile_script: | |||
| - make USE_OPENMP=1 | |||
| FreeBSD_task: | |||
| name: FreeBSD-gcc12 | |||
| freebsd_instance: | |||
| image_family: freebsd-13-2 | |||
| install_script: | |||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
| compile_script: | |||
| - ls -l /usr/local/lib | |||
| - gmake CC=gcc | |||
| FreeBSD_task: | |||
| name: freebsd-gcc12-ilp64 | |||
| freebsd_instance: | |||
| image_family: freebsd-13-2 | |||
| install_script: | |||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
| compile_script: | |||
| - ls -l /usr/local/lib | |||
| - gmake CC=gcc INTERFACE64=1 | |||
| #task: | |||
| # name: Windows/LLVM16 --- too slow --- | |||
| # windows_container: | |||
| # image: cirrusci/windowsservercore:cmake-2021.12.07 | |||
| # install_script: | |||
| # - choco list --localonly | |||
| # - choco install -y llvm | |||
| # - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"' | |||
| # - choco install -y ninja | |||
| # - refreshenv | |||
| # - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build" | |||
| # - vcvarsall x64 | |||
| # - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build" | |||
| # - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release | |||
| # - cd build | |||
| # - cmake --build . | |||
| # - ctest | |||
| @@ -0,0 +1,121 @@ | |||
| name: c910v qemu test | |||
| on: [push, pull_request] | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| TEST: | |||
| runs-on: ubuntu-latest | |||
| env: | |||
| xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 | |||
| toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| include: | |||
| - target: RISCV64_GENERIC | |||
| triple: riscv64-linux-gnu | |||
| apt_triple: riscv64-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=RISCV64_GENERIC | |||
| - target: C910V | |||
| triple: riscv64-unknown-linux-gnu | |||
| apt_triple: riscv64-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=C910V | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: install build deps | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| with: | |||
| repository: T-head-Semi/qemu | |||
| path: qemu | |||
| ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||
| - name: build qemu | |||
| run: | | |||
| # Force use c910v qemu-user | |||
| wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| cd qemu | |||
| patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
| make -j$(nproc) | |||
| make install | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.target }} | |||
| - name: Configure ccache | |||
| run: | | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| - name: build OpenBLAS | |||
| run: | | |||
| wget ${xuetie_toolchain}/${toolchain_file_name} | |||
| tar -xvf ${toolchain_file_name} -C /opt | |||
| export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" | |||
| make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
| - name: test | |||
| run: | | |||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||
| qemu-riscv64 ./utest/openblas_utest | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1 | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat | |||
| @@ -151,40 +151,53 @@ jobs: | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| msystem: [MINGW64, MINGW32, CLANG64] | |||
| msystem: [MINGW64, MINGW32, CLANG64, CLANG32] | |||
| idx: [int32, int64] | |||
| build-type: [Release] | |||
| include: | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: MINGW32 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-i686 | |||
| fc-pkg: mingw-w64-i686-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: CLANG64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| fc-pkg: fc | |||
| # Compiling with Flang 16 seems to cause test errors on machines | |||
| # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
| no-avx512-flags: -DNO_AVX512=1 | |||
| - msystem: CLANG32 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-clang-i686 | |||
| fc-pkg: cc | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| - msystem: MINGW64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: CLANG64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| fc-pkg: fc | |||
| # Compiling with Flang 16 seems to cause test errors on machines | |||
| # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
| no-avx512-flags: -DNO_AVX512=1 | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| build-type: None | |||
| exclude: | |||
| - msystem: MINGW32 | |||
| idx: int64 | |||
| - msystem: CLANG32 | |||
| idx: int64 | |||
| defaults: | |||
| run: | |||
| @@ -209,7 +222,7 @@ jobs: | |||
| install: >- | |||
| base-devel | |||
| ${{ matrix.target-prefix }}-cc | |||
| ${{ matrix.fc-pkg }} | |||
| ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} | |||
| ${{ matrix.target-prefix }}-cmake | |||
| ${{ matrix.target-prefix }}-ninja | |||
| ${{ matrix.target-prefix }}-ccache | |||
| @@ -217,14 +230,21 @@ jobs: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| # It looks like this path needs to be hard-coded. | |||
| path: C:/msys64/home/runneradmin/.ccache | |||
| - name: Prepare ccache | |||
| # Get cache location of ccache | |||
| # Create key that is used in action/cache/restore and action/cache/save steps | |||
| id: ccache-prepare | |||
| run: | | |||
| echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} | |||
| echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT | |||
| - name: Restore ccache | |||
| uses: actions/cache/restore@v3 | |||
| with: | |||
| path: ${{ steps.ccache-prepare.outputs.ccachedir }} | |||
| key: ${{ steps.ccache-prepare.outputs.key }} | |||
| # Restore a matching ccache cache entry. Prefer same branch. | |||
| restore-keys: | | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} | |||
| @@ -234,9 +254,10 @@ jobs: | |||
| # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. | |||
| run: | | |||
| which ccache | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 250M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }} | |||
| echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf | |||
| echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf | |||
| ccache -p | |||
| ccache -s | |||
| echo $HOME | |||
| cygpath -w $HOME | |||
| @@ -253,6 +274,7 @@ jobs: | |||
| -DTARGET=CORE2 \ | |||
| ${{ matrix.idx64-flags }} \ | |||
| ${{ matrix.c-lapack-flags }} \ | |||
| ${{ matrix.no-avx512-flags }} \ | |||
| -DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
| -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
| .. | |||
| @@ -264,10 +286,30 @@ jobs: | |||
| continue-on-error: true | |||
| run: ccache -s | |||
| - name: Save ccache | |||
| # Save the cache after we are done (successfully) building | |||
| uses: actions/cache/save@v3 | |||
| with: | |||
| path: ${{ steps.ccache-prepare.outputs.ccachedir }} | |||
| key: ${{ steps.ccache-prepare.outputs.key }} | |||
| - name: Run tests | |||
| id: run-ctest | |||
| timeout-minutes: 60 | |||
| run: cd build && ctest | |||
| - name: Re-run tests | |||
| if: always() && (steps.run-ctest.outcome == 'failure') | |||
| timeout-minutes: 60 | |||
| run: | | |||
| cd build | |||
| echo "::group::Re-run ctest" | |||
| ctest --rerun-failed --output-on-failure || true | |||
| echo "::endgroup::" | |||
| echo "::group::Log from these tests" | |||
| [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log | |||
| echo "::endgroup::" | |||
| cross_build: | |||
| runs-on: ubuntu-22.04 | |||
| @@ -295,6 +337,7 @@ jobs: | |||
| - name: Install Dependencies | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross | |||
| - name: Compilation cache | |||
| @@ -0,0 +1,110 @@ | |||
| name: loongarch64 qemu test | |||
| on: [push, pull_request] | |||
| jobs: | |||
| TEST: | |||
| runs-on: ubuntu-latest | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| include: | |||
| - target: LOONGSONGENERIC | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSONGENERIC | |||
| - target: LOONGSON3R5 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSON3R5 | |||
| - target: LOONGSON2K1000 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSON2K1000 | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: Install APT deps | |||
| run: | | |||
| sudo add-apt-repository ppa:savoury1/virtualisation | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
| qemu-user-static | |||
| - name: Download and install loongarch64-toolchain | |||
| run: | | |||
| wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz | |||
| tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt | |||
| - name: Set env | |||
| run: | | |||
| echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV | |||
| echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.target }} | |||
| - name: Configure ccache | |||
| run: | | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| - name: Disable utest dsdot:dsdot_n_1 | |||
| run: | | |||
| echo -n > utest/test_dsdot.c | |||
| echo "Due to the qemu versions 7.2 causing utest cases to fail," | |||
| echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." | |||
| - name: Build OpenBLAS | |||
| run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
| - name: Test | |||
| run: | | |||
| qemu-loongarch64-static ./utest/openblas_utest | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat | |||
| @@ -14,6 +14,7 @@ lapack-3.4.2 | |||
| lapack-3.4.2.tgz | |||
| lapack-netlib/make.inc | |||
| lapack-netlib/lapacke/include/lapacke_mangling.h | |||
| lapack-netlib/SRC/la_constants.mod | |||
| lapack-netlib/TESTING/testing_results.txt | |||
| lapack-netlib/INSTALL/test* | |||
| lapack-netlib/TESTING/xeigtstc | |||
| @@ -71,6 +72,7 @@ test/SBLAT3.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/SBBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| @@ -81,6 +83,7 @@ test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/test_shgemm | |||
| test/test_sbgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 23) | |||
| set(OpenBLAS_PATCH_VERSION 23.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| @@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers) | |||
| ####### | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | |||
| option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | |||
| option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | |||
| option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) | |||
| @@ -309,19 +311,25 @@ endif() | |||
| #if (MSVC OR NOT NOFORTRAN) | |||
| if (NOT NO_CBLAS) | |||
| if (NOT ONLY_CBLAS) | |||
| # Broken without fortran on unix | |||
| add_subdirectory(utest) | |||
| add_subdirectory(utest) | |||
| endif() | |||
| endif() | |||
| if (NOT NOFORTRAN) | |||
| if (NOT ONLY_CBLAS) | |||
| # Build test and ctest | |||
| add_subdirectory(test) | |||
| endif() | |||
| if (BUILD_TESTING) | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| endif() | |||
| endif() | |||
| if(NOT NO_CBLAS) | |||
| if (NOT ONLY_CBLAS) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| endif() | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| @@ -398,15 +406,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (${BUILD_LAPACK_DEPRECATED}) | |||
| set (BLD 1) | |||
| else () | |||
| set (BLD 0) | |||
| endif() | |||
| if (${BUILD_BFLOAT16}) | |||
| set (BBF16 1) | |||
| else () | |||
| set (BBF16 0) | |||
| endif() | |||
| if (${BUILD_SINGLE}) | |||
| set (BS 1) | |||
| else () | |||
| set (BS 0) | |||
| endif() | |||
| if (${BUILD_DOUBLE}) | |||
| set (BD 1) | |||
| else () | |||
| set (BD 0) | |||
| endif() | |||
| if (${BUILD_COMPLEX}) | |||
| set (BC 1) | |||
| else () | |||
| set (BC 0) | |||
| endif() | |||
| if (${BUILD_COMPLEX16}) | |||
| set (BZ 1) | |||
| else () | |||
| set (BZ 0) | |||
| endif() | |||
| if (NOT USE_PERL) | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| else() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| @@ -511,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/ | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". | |||
| set(PN OpenBLAS) | |||
| set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") | |||
| set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}") | |||
| configure_package_config_file(cmake/${PN}Config.cmake.in | |||
| "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" | |||
| INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -23,6 +23,9 @@ | |||
| * Optimization on AMD Piledriver | |||
| * Optimization on Intel Haswell | |||
| * Chris Sidebottom <chris.sidebottom@arm.com> | |||
| * Optimizations and other improvements targeting AArch64 | |||
| ## Previous Developers | |||
| * Zaheer Chothia <zaheer.chothia@gmail.com> | |||
| @@ -212,4 +215,4 @@ In chronological order: | |||
| * [2022-03] Support RISC-V Vector Intrinsic 1.0 version. | |||
| * Pablo Romero <https://github.com/pablorcum> | |||
| * [2022-08] Fix building from sources for QNX | |||
| * [2022-08] Fix building from sources for QNX | |||
| @@ -1,4 +1,104 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.24 | |||
| 03-Sep-2023 | |||
| general: | |||
| - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation | |||
| and others, the previous discrepancy appears to have dated back to GotoBLAS) | |||
| - fixed the implementation of ?GEMMT that was added in 0.3.23 | |||
| - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds | |||
| - fixed application of SYMBOLSUFFIX in CMAKE builds | |||
| - fixed missing SSYCONVF function in the shared library | |||
| - fixed parallel build logic used with gmake | |||
| - added support for compilation with LLVM17, in particular its new Fortran compiler | |||
| - added support for CMAKE builds using the NVIDIA HPC compiler | |||
| - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler | |||
| - fixed cross-build detection and management in c_check | |||
| - disabled building of the tests with CMAKE when ONLY_CBLAS is defined | |||
| - fixed several issues with the handling of runtime limits on the number of OPENMP threads | |||
| - corrected the error code returned by SGEADD/DGEADD when LDA is too small | |||
| - corrected the error code returned by IMATCOPY when LDB is too small | |||
| - updated ?NRM2 to support negative increment values (as introduced in release 3.10 | |||
| of the reference BLAS) | |||
| - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location | |||
| - fixed a potential overwrite of unrelated memory during thread initialisation on startup | |||
| - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK | |||
| - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22 | |||
| - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE | |||
| - applied additions and corrections from the development branch of Reference-LAPACK: | |||
| - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885) | |||
| - fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883) | |||
| - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878) | |||
| - fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876) | |||
| - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839) | |||
| - corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867) | |||
| - removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860) | |||
| - updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852) | |||
| - fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855) | |||
| - fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849) | |||
| - added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736) | |||
| - fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854) | |||
| - applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847) | |||
| - removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832) | |||
| - fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836) | |||
| - added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837) | |||
| - updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831) | |||
| - improved algorithm description in ?GELSY (Reference-LAPACK PR 833) | |||
| - fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830) | |||
| - fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768) | |||
| - added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827) | |||
| - added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795) | |||
| - fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820) | |||
| - adopted the refactored ?GEBAL implementation (Reference-LAPACK PR 808) | |||
| x86_64: | |||
| - added cpu model autodetection for Intel Alder Lake N | |||
| - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel | |||
| - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer | |||
| - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG | |||
| - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH | |||
| - fixed feature-based cputype fallback in DYNAMIC_ARCH | |||
| - added support for building the AVX512 kernels with the NVIDIA HPC compiler | |||
| - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case | |||
| - fixed a potential use of uninitialized variables in ZTRSM | |||
| ARM64: | |||
| - added cpu model autodetection for Apple M2 | |||
| - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register) | |||
| - added support for building the SVE kernels with the NVIDIA HPC compiler | |||
| - added support for building the SVE kernels with the Apple Clang compiler | |||
| - fixed compiler option handling for building the SVE kernels with LLVM | |||
| - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse | |||
| - activated SVE SGEMM and DGEMM kernels for Neoverse V1 | |||
| - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1 | |||
| - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH | |||
| - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or | |||
| container restrictions into account | |||
| - fixed a potential use of uninitialized variables in ZTRSM | |||
| - fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds | |||
| LOONGARCH64: | |||
| - added ABI detection | |||
| - added support for cpu affinity handling | |||
| - fixed compilation with early versions of the Loongson toolchain | |||
| - added an optimized SGEMM kernel for 3A5000 | |||
| - added optimized DGEMV kernels for 3A5000 | |||
| - improved the performance of the DGEMM kernel for 3A5000 | |||
| MIPS64: | |||
| - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target | |||
| POWER: | |||
| - fixed compiler warnings in the POWER10 SBGEMM kernel | |||
| RISCV: | |||
| - fixed application of the INTERFACE64 option when building with CMAKE | |||
| - fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds | |||
| - fixed IDAMAX and DOT kernels for C910V | |||
| - fixed corner cases in the ROT and SWAP kernels for C910V | |||
| - fixed compilation of the C910V target with recent vendor compilers | |||
| ==================================================================== | |||
| Version 0.3.23 | |||
| 01-Apr-2023 | |||
| @@ -1,9 +1,14 @@ | |||
| node { | |||
| stage('Checkout') { | |||
| checkout | |||
| pipeline { | |||
| agent { | |||
| docker { | |||
| image 'osuosl/ubuntu-s390x' | |||
| } | |||
| } | |||
| stages { | |||
| stage('Build') { | |||
| sh("make") | |||
| steps { | |||
| sh 'make clean && make' | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,16 @@ | |||
| pipeline { | |||
| agent { | |||
| docker { | |||
| image 'osuosl/ubuntu-ppc64le' | |||
| } | |||
| } | |||
| stages { | |||
| stage('Build') { | |||
| steps { | |||
| sh 'sudo apt update' | |||
| sh 'sudo apt install gfortran -y' | |||
| sh 'make clean && make' | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||
| .PHONY : all libs netlib $(RELA) test ctest shared install | |||
| .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | |||
| .NOTPARALLEL : shared | |||
| all :: libs netlib $(RELA) tests shared | |||
| all :: tests | |||
| @echo | |||
| @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | |||
| @echo | |||
| @@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT) | |||
| endif | |||
| endif | |||
| tests : libs netlib $(RELA) shared | |||
| tests : shared | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| @@ -373,10 +373,10 @@ ifneq ($(CROSS), 1) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) | |||
| endif | |||
| lapack-runtest: | |||
| lapack-runtest: lapack-test | |||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | |||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) | |||
| (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) | |||
| blas-test: | |||
| @@ -69,7 +69,7 @@ endif | |||
| # in GCC>=9 | |||
| ifeq ($(CORE), NEOVERSEN1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| @@ -92,9 +92,14 @@ endif | |||
| # in GCC>=10.4 | |||
| ifeq ($(CORE), NEOVERSEV1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifeq (1, $(ISCLANG)) | |||
| CCOMMON_OPT += -mtune=cortex-x1 | |||
| else | |||
| CCOMMON_OPT += -mtune=neoverse-v1 | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| @@ -122,8 +127,8 @@ endif | |||
| # in GCC>=10.4 | |||
| ifeq ($(CORE), NEOVERSEN2) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifneq ($(OSNAME), Darwin) | |||
| CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| else | |||
| @@ -155,7 +160,7 @@ endif | |||
| # Use a53 tunings because a55 is only available in GCC>=8.1 | |||
| ifeq ($(CORE), CORTEXA55) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ8), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| @@ -196,8 +201,13 @@ endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX3T110) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.3-a | |||
| ifeq (0, $(ISCLANG)) | |||
| CCOMMON_OPT += -mtune=thunderx3t110 | |||
| else | |||
| CCOMMON_OPT += -mtune=thunderx2t99 | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| endif | |||
| @@ -225,9 +235,12 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | |||
| ifeq ($(CORE), EMAG8180) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=emag | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifeq ($(ISCLANG), 0) | |||
| CCOMMON_OPT += -mtune=emag | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=emag | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.23 | |||
| VERSION = 0.3.23.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) | |||
| endif | |||
| # | |||
| # OS dependent settings | |||
| # | |||
| @@ -645,7 +650,7 @@ DYNAMIC_CORE += HASWELL ZEN | |||
| endif | |||
| ifneq ($(NO_AVX512), 1) | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += SKYLAKEX COOPERLAKE | |||
| DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1 | |||
| ifneq ($(NO_SVE), 1) | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| endif | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| @@ -932,8 +938,12 @@ BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| CCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
| FCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
| LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) | |||
| ifneq ($(LA64_ABI), lp64d) | |||
| LA64_ABI=lp64 | |||
| endif | |||
| CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) | |||
| FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) | |||
| endif | |||
| endif | |||
| @@ -1082,8 +1092,9 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) | |||
| CCOMMON_OPT += -DF_INTERFACE_GFORT | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| FCOMMON_OPT += -Wall | |||
| # make single-threaded LAPACK calls thread-safe #1847 | |||
| FCOMMON_OPT += -frecursive | |||
| @@ -1097,6 +1108,7 @@ EXTRALIB += -lgfortran | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_BINARY_MODE | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
| ifdef BINARY64 | |||
| @@ -1763,6 +1775,8 @@ export TARGET_CORE | |||
| export NO_AVX512 | |||
| export NO_AVX2 | |||
| export BUILD_BFLOAT16 | |||
| export NO_LSX | |||
| export NO_LASX | |||
| export SBGEMM_UNROLL_M | |||
| export SBGEMM_UNROLL_N | |||
| @@ -75,18 +75,31 @@ endif | |||
| ifeq ($(CORE), COOPERLAKE) | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| else ifeq ($(C_COMPILER), CLANG) | |||
| # cooperlake support was added in clang 9 | |||
| ifeq ($(CLANGVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # not supported in clang, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -104,18 +117,31 @@ endif | |||
| ifeq ($(CORE), SAPPHIRERAPIDS) | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # sapphire rapids support was added in 11 | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| # sapphire rapids support was added in 11 | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| else ifeq ($(C_COMPILER), CLANG) | |||
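| # sapphire rapids support was added in clang 12 (NOTE(review): the -march flag below still selects cooperlake - confirm whether sapphirerapids was intended) | |||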
| ifeq ($(CLANGVERSIONGTEQ12), 1) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # not supported in clang, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -6,11 +6,15 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | |||
| Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/) | |||
| Cirrus CI: [](https://cirrus-ci.com/github/xianyi/OpenBLAS) | |||
| <!-- Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)--> | |||
| [](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) | |||
| OSUOSL POWERCI [](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/) | |||
| OSUOSL IBMZ-CI [](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/) | |||
| ## Introduction | |||
| OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. | |||
| @@ -115,7 +115,7 @@ jobs: | |||
| mkdir build | |||
| cd build | |||
| call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| @@ -271,6 +271,19 @@ jobs: | |||
| - script: | | |||
| make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| - job: OSX_xbuild_DYNAMIC_ARM64 | |||
| pool: | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 | |||
| steps: | |||
| - script: | | |||
| ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs | |||
| /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus | |||
| /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version | |||
| make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| - job: ALPINE_MUSL | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| @@ -1,5 +1,5 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| Copyright (c) 2014, 2023 The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| @@ -67,7 +67,7 @@ int main(int argc, char *argv[]){ | |||
| int step = 1; | |||
| int loops = 1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| double time1,timeg; | |||
| @@ -77,7 +77,7 @@ int main(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| @@ -31,13 +31,17 @@ flags="$*" | |||
| cross_suffix="" | |||
| if [ "`dirname \"$compiler_name\"`" != '.' ]; then | |||
| cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" | |||
| if [ "`dirname "$compiler_name"`" != '.' ]; then | |||
| cross_suffix="$cross_suffix`dirname "$compiler_name"`/" | |||
| fi | |||
| bn=`basename $compiler_name` | |||
| cn=`echo $compiler_name | sed -e 's/ -.*//'` | |||
| bn=`basename "$cn"` | |||
| case "$bn" in | |||
| *-*) cross_suffix="$cross_suffix${bn%-*}-" | |||
| *-*) if [ "$bn" != '-' ]; then | |||
| cross_suffix="$cross_suffix${bn%-*}-" | |||
| fi | |||
| esac | |||
| compiler="" | |||
| @@ -164,7 +168,7 @@ fi | |||
| no_msa=0 | |||
| if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then | |||
| tmpd="$(mktemp -d)" | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| code='"addvi.b $w0, $w1, 1"' | |||
| msa_flags='-mmsa -mfp64 -mload-store-pairs' | |||
| @@ -181,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then | |||
| rm -rf "$tmpd" | |||
| fi | |||
| no_lsx=0 | |||
| no_lasx=0 | |||
| if [ "$architecture" = "loongarch64" ]; then | |||
| tmpd="$(mktemp -d)" | |||
| tmplsx="$tmpd/lsx.c" | |||
| codelsx='"vadd.b $vr0, $vr0, $vr0"' | |||
| lsx_flags='-march=loongarch64 -mlsx' | |||
| printf "#include <lsxintrin.h>\n\n" >> "$tmplsx" | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" | |||
| args="$lsx_flags -o $tmplsx.o $tmplsx" | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_lsx=1 | |||
| } | |||
| tmplasx="$tmpd/lasx.c" | |||
| codelasx='"xvadd.b $xr0, $xr0, $xr0"' | |||
| lasx_flags='-march=loongarch64 -mlasx' | |||
| printf "#include <lasxintrin.h>\n\n" >> "$tmplasx" | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" | |||
| args="$lasx_flags -o $tmplasx.o $tmplasx" | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_lasx=1 | |||
| } | |||
| rm -rf "$tmpd" | |||
| fi | |||
| case "$data" in | |||
| *ARCH_X86_64*) architecture=x86_64 ;; | |||
| *ARCH_X86*) architecture=x86 ;; | |||
| @@ -204,7 +239,7 @@ esac | |||
| no_avx512=0 | |||
| if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then | |||
| tmpd=`mktemp -d` | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| code='"vbroadcastss -4 * 4(%rsi), %zmm2"' | |||
| printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" | |||
| @@ -225,7 +260,7 @@ fi | |||
| no_rv64gv=0 | |||
| if [ "$architecture" = "riscv64" ]; then | |||
| tmpd=`mktemp -d` | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| code='"vsetvli zero, zero, e8, m1\n"' | |||
| printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" | |||
| @@ -241,13 +276,16 @@ fi | |||
| no_sve=0 | |||
| if [ "$architecture" = "arm64" ]; then | |||
| tmpd=`mktemp -d` | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf" | |||
| args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf" | |||
| no_sve=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_sve=1 | |||
| } | |||
| @@ -257,7 +295,7 @@ fi | |||
| c11_atomics=0 | |||
| case "$data" in | |||
| *HAVE_C11*) | |||
| tmpd=`mktemp -d` | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf" | |||
| args=" -c -o $tmpf.o $tmpf" | |||
| @@ -395,6 +433,8 @@ done | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" | |||
| [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" | |||
| } >> "$makefile" | |||
| os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` | |||
| @@ -410,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` | |||
| [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" | |||
| [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" | |||
| [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" | |||
| [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" | |||
| [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" | |||
| } >> "$config" | |||
| @@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| } | |||
| } | |||
| $no_lsx = 0; | |||
| $no_lasx = 0; | |||
| if (($architecture eq "loongarch64")) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; | |||
| } else { | |||
| $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; | |||
| $lsx_flags = "-march=loongarch64 -mlsx"; | |||
| print $tmplsx "#include <lsxintrin.h>\n\n"; | |||
| print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; | |||
| $args = "$lsx_flags -o $tmplsx.o $tmplsx"; | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_lsx = 1; | |||
| } else { | |||
| $no_lsx = 0; | |||
| } | |||
| unlink("$tmplsx.o"); | |||
| $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; | |||
| $lasx_flags = "-march=loongarch64 -mlasx"; | |||
| print $tmplasx "#include <lasxintrin.h>\n\n"; | |||
| print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; | |||
| $args = "$lasx_flags -o $tmplasx.o $tmplasx"; | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_lasx = 1; | |||
| } else { | |||
| $no_lasx = 0; | |||
| } | |||
| unlink("$tmplasx.o"); | |||
| } | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| @@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; | |||
| print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; | |||
| print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; | |||
| print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| @@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
| print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; | |||
| print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; | |||
| if ($os eq "LINUX") { | |||
| @@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL | |||
| void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); | |||
| void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...); | |||
| /*** BLAS extensions ***/ | |||
| @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2") | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -82,7 +82,7 @@ if (DYNAMIC_ARCH) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | |||
| endif () | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS) | |||
| string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| @@ -135,7 +135,7 @@ if (ARM64) | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| if (${ARCH} STREQUAL "riscv64") | |||
| if (RISCV64) | |||
| set(NO_BINARY_MODE 1) | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| @@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") | |||
| endif () | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") | |||
| if (POWER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") | |||
| else () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") | |||
| endif () | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") | |||
| if (BINARY64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -m64") | |||
| @@ -172,22 +180,30 @@ endif () | |||
| if (${CORE} STREQUAL NEOVERSEN2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL NEOVERSEV1) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| endif () | |||
| endif () | |||
| @@ -205,7 +221,11 @@ endif () | |||
| if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| @@ -3,7 +3,8 @@ | |||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | |||
| ## Sets Fortran related variables. | |||
| if (${F_COMPILER} STREQUAL "FLANG") | |||
| if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
| @@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | |||
| # ensure reentrancy of lapack codes | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
| # work around ABI violation in passing string arguments from C | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| if (NOT NO_LAPACK) | |||
| set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
| if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| # ensure reentrancy of lapack codes | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
| # work around ABI violation in passing string arguments from C | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
| if (NOT NO_LAPACK) | |||
| # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
| endif () | |||
| endif () | |||
| if (NO_BINARY_MODE) | |||
| if (MIPS64) | |||
| @@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
| endif () | |||
| endif () | |||
| if (RISCV64) | |||
| if (BINARY64) | |||
| if (INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| else () | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| @@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "PGI") | |||
| if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") | |||
| set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") | |||
| if (BINARY64) | |||
| @@ -124,7 +124,7 @@ set(SLASRC | |||
| ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f | |||
| sgesvdq.f slaorhr_col_getrfnp.f | |||
| slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f | |||
| slatrs3.f strsyl3.f sgelst.f) | |||
| slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90) | |||
| set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f | |||
| sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f | |||
| @@ -187,7 +187,7 @@ set(CLASRC | |||
| cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f | |||
| cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f | |||
| cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f | |||
| crot.f cspcon.f csprfs.f cspsv.f | |||
| crot.f crscl.f cspcon.f csprfs.f cspsv.f | |||
| cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f | |||
| cstegr.f cstein.f csteqr.f csycon.f | |||
| csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f | |||
| @@ -223,7 +223,7 @@ set(CLASRC | |||
| chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f | |||
| cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f | |||
| cungtsqr.f cungtsqr_row.f cunhr_col.f | |||
| clatrs3.f ctrsyl3.f cgelst.f) | |||
| clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90) | |||
| set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f | |||
| cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f | |||
| @@ -316,7 +316,7 @@ set(DLASRC | |||
| dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f | |||
| dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f | |||
| dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f | |||
| dlatrs3.f dtrsyl3.f dgelst.f) | |||
| dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90) | |||
| set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f | |||
| dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f | |||
| @@ -381,7 +381,7 @@ set(ZLASRC | |||
| zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f | |||
| zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f | |||
| zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f | |||
| zrot.f zspcon.f zsprfs.f zspsv.f | |||
| zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f | |||
| zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f | |||
| zstegr.f zstein.f zsteqr.f zsycon.f | |||
| zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f | |||
| @@ -419,7 +419,7 @@ set(ZLASRC | |||
| zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f | |||
| zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f | |||
| zungtsqr.f zungtsqr_row.f zunhr_col.f | |||
| zlatrs3.f ztrsyl3.f zgelst.f) | |||
| zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90) | |||
| set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f | |||
| zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f | |||
| @@ -436,6 +436,7 @@ if(USE_XBLAS) | |||
| set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) | |||
| endif() | |||
| if(BUILD_LAPACK_DEPRECATED) | |||
| list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f | |||
| DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f | |||
| DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) | |||
| @@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f | |||
| DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f | |||
| DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) | |||
| message(STATUS "Building deprecated routines") | |||
| endif() | |||
| set(DSLASRC spotrs.f) | |||
| @@ -622,7 +624,7 @@ set(SLASRC | |||
| ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c | |||
| sgesvdq.c slaorhr_col_getrfnp.c | |||
| slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c | |||
| slatrs3.c strsyl3.c sgelst.c) | |||
| slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c) | |||
| set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c | |||
| sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c | |||
| @@ -684,7 +686,7 @@ set(CLASRC | |||
| cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c | |||
| cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c | |||
| cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c | |||
| crot.c cspcon.c csprfs.c cspsv.c | |||
| crot.c crscl.c cspcon.c csprfs.c cspsv.c | |||
| cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c | |||
| cstegr.c cstein.c csteqr.c csycon.c | |||
| csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c | |||
| @@ -720,7 +722,7 @@ set(CLASRC | |||
| chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c | |||
| cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c | |||
| cungtsqr.c cungtsqr_row.c cunhr_col.c | |||
| clatrs3.c ctrsyl3.c cgelst.c) | |||
| clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c) | |||
| set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c | |||
| cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c | |||
| @@ -812,7 +814,7 @@ set(DLASRC | |||
| dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c | |||
| dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c | |||
| dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c | |||
| dlatrs3.c dtrsyl3.c dgelst.c) | |||
| dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c) | |||
| set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c | |||
| dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c | |||
| @@ -876,7 +878,7 @@ set(ZLASRC | |||
| zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c | |||
| zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c | |||
| zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c | |||
| zrot.c zspcon.c zsprfs.c zspsv.c | |||
| zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c | |||
| zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c | |||
| zstegr.c zstein.c zsteqr.c zsycon.c | |||
| zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c | |||
| @@ -913,7 +915,8 @@ set(ZLASRC | |||
| zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c | |||
| zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c | |||
| zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c | |||
| zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) | |||
| zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c | |||
| zgedmd.c zgedmdq.c) | |||
| set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c | |||
| zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c | |||
| @@ -930,6 +933,7 @@ if(USE_XBLAS) | |||
| set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) | |||
| endif() | |||
| if(BUILD_LAPACK_DEPRECATED) | |||
| list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c | |||
| DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c | |||
| DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) | |||
| @@ -943,6 +947,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c | |||
| DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c | |||
| DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) | |||
| message(STATUS "Building deprecated routines") | |||
| endif() | |||
| set(DSLASRC spotrs.c) | |||
| @@ -70,8 +70,6 @@ set(CSRC | |||
| lapacke_cgeqlf_work.c | |||
| lapacke_cgeqp3.c | |||
| lapacke_cgeqp3_work.c | |||
| lapacke_cgeqpf.c | |||
| lapacke_cgeqpf_work.c | |||
| lapacke_cgeqr.c | |||
| lapacke_cgeqr_work.c | |||
| lapacke_cgeqr2.c | |||
| @@ -92,6 +90,10 @@ set(CSRC | |||
| lapacke_cgerqf_work.c | |||
| lapacke_cgesdd.c | |||
| lapacke_cgesdd_work.c | |||
| lapacke_cgedmd.c | |||
| lapacke_cgedmd_work.c | |||
| lapacke_cgedmdq.c | |||
| lapacke_cgedmdq_work.c | |||
| lapacke_cgesv.c | |||
| lapacke_cgesv_work.c | |||
| lapacke_cgesvd.c | |||
| @@ -144,12 +146,8 @@ set(CSRC | |||
| lapacke_cggqrf_work.c | |||
| lapacke_cggrqf.c | |||
| lapacke_cggrqf_work.c | |||
| lapacke_cggsvd.c | |||
| lapacke_cggsvd_work.c | |||
| lapacke_cggsvd3.c | |||
| lapacke_cggsvd3_work.c | |||
| lapacke_cggsvp.c | |||
| lapacke_cggsvp_work.c | |||
| lapacke_cggsvp3.c | |||
| lapacke_cggsvp3_work.c | |||
| lapacke_cgtcon.c | |||
| @@ -564,6 +562,8 @@ set(CSRC | |||
| lapacke_ctrsna_work.c | |||
| lapacke_ctrsyl.c | |||
| lapacke_ctrsyl_work.c | |||
| lapacke_ctrsyl3.c | |||
| lapacke_ctrsyl3_work.c | |||
| lapacke_ctrtri.c | |||
| lapacke_ctrtri_work.c | |||
| lapacke_ctrtrs.c | |||
| @@ -596,6 +596,8 @@ set(CSRC | |||
| lapacke_cungtr_work.c | |||
| lapacke_cungtsqr_row.c | |||
| lapacke_cungtsqr_row_work.c | |||
| lapacke_cunhr_col.c | |||
| lapacke_cunhr_col_work.c | |||
| lapacke_cunmbr.c | |||
| lapacke_cunmbr_work.c | |||
| lapacke_cunmhr.c | |||
| @@ -695,8 +697,6 @@ set(DSRC | |||
| lapacke_dgeqlf_work.c | |||
| lapacke_dgeqp3.c | |||
| lapacke_dgeqp3_work.c | |||
| lapacke_dgeqpf.c | |||
| lapacke_dgeqpf_work.c | |||
| lapacke_dgeqr.c | |||
| lapacke_dgeqr_work.c | |||
| lapacke_dgeqr2.c | |||
| @@ -717,6 +717,10 @@ set(DSRC | |||
| lapacke_dgerqf_work.c | |||
| lapacke_dgesdd.c | |||
| lapacke_dgesdd_work.c | |||
| lapacke_dgedmd.c | |||
| lapacke_dgedmd_work.c | |||
| lapacke_dgedmdq.c | |||
| lapacke_dgedmdq_work.c | |||
| lapacke_dgesv.c | |||
| lapacke_dgesv_work.c | |||
| lapacke_dgesvd.c | |||
| @@ -771,12 +775,8 @@ set(DSRC | |||
| lapacke_dggqrf_work.c | |||
| lapacke_dggrqf.c | |||
| lapacke_dggrqf_work.c | |||
| lapacke_dggsvd.c | |||
| lapacke_dggsvd_work.c | |||
| lapacke_dggsvd3.c | |||
| lapacke_dggsvd3_work.c | |||
| lapacke_dggsvp.c | |||
| lapacke_dggsvp_work.c | |||
| lapacke_dggsvp3.c | |||
| lapacke_dggsvp3_work.c | |||
| lapacke_dgtcon.c | |||
| @@ -874,6 +874,8 @@ set(DSRC | |||
| lapacke_dorgtr_work.c | |||
| lapacke_dorgtsqr_row.c | |||
| lapacke_dorgtsqr_row_work.c | |||
| lapacke_dorhr_col.c | |||
| lapacke_dorhr_col_work.c | |||
| lapacke_dormbr.c | |||
| lapacke_dormbr_work.c | |||
| lapacke_dormhr.c | |||
| @@ -1186,6 +1188,8 @@ set(DSRC | |||
| lapacke_dtrsna_work.c | |||
| lapacke_dtrsyl.c | |||
| lapacke_dtrsyl_work.c | |||
| lapacke_dtrsyl3.c | |||
| lapacke_dtrsyl3_work.c | |||
| lapacke_dtrtri.c | |||
| lapacke_dtrtri_work.c | |||
| lapacke_dtrtrs.c | |||
| @@ -1275,8 +1279,6 @@ set(SSRC | |||
| lapacke_sgeqlf_work.c | |||
| lapacke_sgeqp3.c | |||
| lapacke_sgeqp3_work.c | |||
| lapacke_sgeqpf.c | |||
| lapacke_sgeqpf_work.c | |||
| lapacke_sgeqr.c | |||
| lapacke_sgeqr_work.c | |||
| lapacke_sgeqr2.c | |||
| @@ -1297,6 +1299,10 @@ set(SSRC | |||
| lapacke_sgerqf_work.c | |||
| lapacke_sgesdd.c | |||
| lapacke_sgesdd_work.c | |||
| lapacke_sgedmd.c | |||
| lapacke_sgedmd_work.c | |||
| lapacke_sgedmdq.c | |||
| lapacke_sgedmdq_work.c | |||
| lapacke_sgesv.c | |||
| lapacke_sgesv_work.c | |||
| lapacke_sgesvd.c | |||
| @@ -1351,12 +1357,8 @@ set(SSRC | |||
| lapacke_sggqrf_work.c | |||
| lapacke_sggrqf.c | |||
| lapacke_sggrqf_work.c | |||
| lapacke_sggsvd.c | |||
| lapacke_sggsvd_work.c | |||
| lapacke_sggsvd3.c | |||
| lapacke_sggsvd3_work.c | |||
| lapacke_sggsvp.c | |||
| lapacke_sggsvp_work.c | |||
| lapacke_sggsvp3.c | |||
| lapacke_sggsvp3_work.c | |||
| lapacke_sgtcon.c | |||
| @@ -1453,6 +1455,8 @@ set(SSRC | |||
| lapacke_sorgtr_work.c | |||
| lapacke_sorgtsqr_row.c | |||
| lapacke_sorgtsqr_row_work.c | |||
| lapacke_sorhr_col.c | |||
| lapacke_sorhr_col_work.c | |||
| lapacke_sormbr.c | |||
| lapacke_sormbr_work.c | |||
| lapacke_sormhr.c | |||
| @@ -1762,6 +1766,8 @@ set(SSRC | |||
| lapacke_strsna_work.c | |||
| lapacke_strsyl.c | |||
| lapacke_strsyl_work.c | |||
| lapacke_ctrsyl3.c | |||
| lapacke_ctrsyl3_work.c | |||
| lapacke_strtri.c | |||
| lapacke_strtri_work.c | |||
| lapacke_strtrs.c | |||
| @@ -1849,8 +1855,6 @@ set(ZSRC | |||
| lapacke_zgeqlf_work.c | |||
| lapacke_zgeqp3.c | |||
| lapacke_zgeqp3_work.c | |||
| lapacke_zgeqpf.c | |||
| lapacke_zgeqpf_work.c | |||
| lapacke_zgeqr.c | |||
| lapacke_zgeqr_work.c | |||
| lapacke_zgeqr2.c | |||
| @@ -1871,6 +1875,10 @@ set(ZSRC | |||
| lapacke_zgerqf_work.c | |||
| lapacke_zgesdd.c | |||
| lapacke_zgesdd_work.c | |||
| lapacke_zgedmd.c | |||
| lapacke_zgedmd_work.c | |||
| lapacke_zgedmdq.c | |||
| lapacke_zgedmdq_work.c | |||
| lapacke_zgesv.c | |||
| lapacke_zgesv_work.c | |||
| lapacke_zgesvd.c | |||
| @@ -1925,12 +1933,8 @@ set(ZSRC | |||
| lapacke_zggqrf_work.c | |||
| lapacke_zggrqf.c | |||
| lapacke_zggrqf_work.c | |||
| lapacke_zggsvd.c | |||
| lapacke_zggsvd_work.c | |||
| lapacke_zggsvd3.c | |||
| lapacke_zggsvd3_work.c | |||
| lapacke_zggsvp.c | |||
| lapacke_zggsvp_work.c | |||
| lapacke_zggsvp3.c | |||
| lapacke_zggsvp3_work.c | |||
| lapacke_zgtcon.c | |||
| @@ -2343,6 +2347,8 @@ set(ZSRC | |||
| lapacke_ztrsna_work.c | |||
| lapacke_ztrsyl.c | |||
| lapacke_ztrsyl_work.c | |||
| lapacke_ztrsyl3.c | |||
| lapacke_ztrsyl3_work.c | |||
| lapacke_ztrtri.c | |||
| lapacke_ztrtri_work.c | |||
| lapacke_ztrtrs.c | |||
| @@ -2375,6 +2381,8 @@ set(ZSRC | |||
| lapacke_zungtr_work.c | |||
| lapacke_zungtsqr_row.c | |||
| lapacke_zungtsqr_row_work.c | |||
| lapacke_zunhr_col.c | |||
| lapacke_zunhr_col_work.c | |||
| lapacke_zunmbr.c | |||
| lapacke_zunmbr_work.c | |||
| lapacke_zunmhr.c | |||
| @@ -2401,6 +2409,12 @@ set(ZSRC | |||
| lapacke_csyr_work.c | |||
| lapacke_ilaver.c | |||
| ) | |||
| if (BUILD_LAPACK_DEPRECATED) | |||
| set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||
| set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||
| set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||
| set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||
| endif() | |||
| set(SRCX | |||
| lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c | |||
| @@ -55,7 +55,7 @@ if (DEFINED TARGET) | |||
| endif () | |||
| # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. | |||
| if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") | |||
| if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")) | |||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") | |||
| endif () | |||
| @@ -280,7 +280,41 @@ if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL POWER8) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| endif() | |||
| if (${TARGET} STREQUAL NEOVERSEV1) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL NEOVERSEN2) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL ARMV8SVE) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve") | |||
| else () | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (DEFINED BINARY) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| endif () | |||
| @@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
| set(LOONGARCH64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") | |||
| set(RISCV64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| if (NOT BINARY) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| @@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| endif() | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| set(ARM64 1) | |||
| else() | |||
| @@ -107,7 +109,7 @@ else() | |||
| endif () | |||
| if (NOT BINARY) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) | |||
| set(BINARY 64) | |||
| else () | |||
| set(BINARY 32) | |||
| @@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| # Example 1: SBGEMM_SMALL_M_PERMIT = | |||
| # Unset the variable | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| set(var_name ${CMAKE_MATCH_1}) | |||
| unset(${var_name}) | |||
| endif() | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on ${line_match}") | |||
| @@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){ | |||
| #endif // !RPCC_DEFINED | |||
| #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| do { | |||
| while (*address) {YIELDING;}; | |||
| @@ -45,7 +45,7 @@ | |||
| #define WMB asm("wmb") | |||
| #define RMB asm("mb") | |||
| static void __inline blas_lock(unsigned long *address){ | |||
| static __inline void blas_lock(unsigned long *address){ | |||
| #ifndef __DECC | |||
| unsigned long tmp1, tmp2; | |||
| asm volatile( | |||
| @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| int register ret; | |||
| @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef ASSEMBLER | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| BLASULONG ret; | |||
| @@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| } | |||
| #ifndef NO_AFFINITY | |||
| static inline int WhereAmI(void){ | |||
| int ret = 0, counter = 0; | |||
| __asm__ volatile ( | |||
| "rdtimel.w %[counter], %[id]" | |||
| : [id]"=r"(ret), [counter]"=r"(counter) | |||
| : | |||
| : "memory" | |||
| ); | |||
| return ret; | |||
| } | |||
| #endif | |||
| #ifdef DOUBLE | |||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") | |||
| #else | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -45,12 +46,14 @@ | |||
| typedef struct { | |||
| int dtb_entries; | |||
| int switch_ratio; | |||
| int offsetA, offsetB, align; | |||
| #if BUILD_BFLOAT16 == 1 | |||
| int sbgemm_p, sbgemm_q, sbgemm_r; | |||
| int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | |||
| int sbgemm_align_k; | |||
| int need_amxtile_permission; // 0 default, 1 for device support amx. | |||
| void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | |||
| @@ -91,7 +91,7 @@ | |||
| void *qalloc(int flags, size_t bytes); | |||
| static void INLINE blas_lock(volatile unsigned long *address){ | |||
| static INLINE void blas_lock(volatile unsigned long *address){ | |||
| long int ret, val = 1; | |||
| @@ -45,7 +45,7 @@ | |||
| #ifndef ASSEMBLER | |||
| static void __inline blas_lock(volatile unsigned long *address){ | |||
| static __inline void blas_lock(volatile unsigned long *address){ | |||
| long int ret = 1; | |||
| @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); | |||
| /* Global Parameter */ | |||
| extern int blas_cpu_number; | |||
| extern int blas_num_threads; | |||
| extern int blas_num_threads_set; | |||
| extern int blas_omp_linked; | |||
| #define BLAS_LEGACY 0x8000U | |||
| @@ -136,15 +135,13 @@ typedef struct blas_queue { | |||
| #ifdef SMP_SERVER | |||
| extern int blas_server_avail; | |||
| extern int blas_omp_number_max; | |||
| static __inline int num_cpu_avail(int level) { | |||
| #ifdef USE_OPENMP | |||
| int openmp_nthreads; | |||
| if (blas_num_threads_set == 0) | |||
| openmp_nthreads=omp_get_max_threads(); | |||
| else | |||
| openmp_nthreads=blas_cpu_number; | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| @@ -156,7 +153,13 @@ int openmp_nthreads; | |||
| ) return 1; | |||
| #ifdef USE_OPENMP | |||
| if (blas_cpu_number != openmp_nthreads) { | |||
| if (openmp_nthreads > blas_omp_number_max){ | |||
| #ifdef DEBUG | |||
| fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); | |||
| #endif | |||
| openmp_nthreads = blas_omp_number_max; | |||
| } | |||
| if (blas_cpu_number != openmp_nthreads) { | |||
| goto_set_num_threads(openmp_nthreads); | |||
| } | |||
| #endif | |||
| @@ -54,7 +54,7 @@ | |||
| #define __volatile__ | |||
| #endif | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| int ret; | |||
| @@ -70,7 +70,7 @@ | |||
| #define RMB | |||
| #endif | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| #ifndef C_MSVC | |||
| @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef ASSEMBLER | |||
| /* | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| static __inline void blas_lock(volatile BLASULONG *address){ | |||
| BLASULONG ret; | |||
| @@ -267,8 +267,9 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
| if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; | |||
| sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <sys/auxv.h> | |||
| /* If LASX extension instructions supported, | |||
| * using core LOONGSON3R5 | |||
| @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_LOONGSON3R5 1 | |||
| #define CPU_LOONGSON2K1000 2 | |||
| #define LOONGARCH_CFG2 0x02 | |||
| #define LOONGARCH_LASX 1<<7 | |||
| #define LOONGARCH_LSX 1<<6 | |||
| #define LA_HWCAP_LSX (1<<4) | |||
| #define LA_HWCAP_LASX (1<<5) | |||
| static char *cpuname[] = { | |||
| "LOONGSONGENERIC", | |||
| @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { | |||
| int detect(void) { | |||
| #ifdef __linux | |||
| uint32_t reg = 0; | |||
| int flag = (int)getauxval(AT_HWCAP); | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (reg & LOONGARCH_LASX) | |||
| if (flag & LA_HWCAP_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else if (reg & LOONGARCH_LSX) | |||
| else if (flag & LA_HWCAP_LSX) | |||
| return CPU_LOONGSON2K1000; | |||
| else | |||
| return CPU_GENERIC; | |||
| @@ -1479,6 +1479,8 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 15: // Sapphire Rapids | |||
| if(support_amx_bf16()) | |||
| return CPUTYPE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| @@ -1549,6 +1551,7 @@ int get_cpuname(void){ | |||
| case 7: // Raptor Lake | |||
| case 10: | |||
| case 15: | |||
| case 14: // Alder Lake N | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| @@ -1845,7 +1848,8 @@ static char *cpuname[] = { | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA", | |||
| "COOPERLAKE" | |||
| "COOPERLAKE", | |||
| "SAPPHIRERAPIDS", | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1902,7 +1906,8 @@ static char *lowercpuname[] = { | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana", | |||
| "cooperlake" | |||
| "cooperlake", | |||
| "sapphirerapids", | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1936,7 +1941,8 @@ static char *corename[] = { | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA", | |||
| "COOPERLAKE" | |||
| "COOPERLAKE", | |||
| "SAPPHIRERAPIDS", | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1970,7 +1976,8 @@ static char *corename_lower[] = { | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana", | |||
| "cooperlake" | |||
| "cooperlake", | |||
| "sapphirerapids", | |||
| }; | |||
| @@ -2276,16 +2283,18 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 15) { // Sapphire Rapids | |||
| if(support_amx_bf16()) | |||
| return CORE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| @@ -2352,6 +2361,7 @@ int get_coretype(void){ | |||
| case 7: // Raptor Lake | |||
| case 10: | |||
| case 15: | |||
| case 14: // Alder Lake N | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| @@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| CEXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| @@ -0,0 +1,270 @@ | |||
| # Guidance for redistributing OpenBLAS | |||
| *We note that this document contains recommendations only - packagers and other | |||
| redistributors are in charge of how OpenBLAS is built and distributed in their | |||
| systems, and may have good reasons to deviate from the guidance given on this | |||
| page. These recommendations are aimed at general packaging systems, with a user | |||
| base that typically is large, open source (or freely available at least), and | |||
| doesn't behave uniformly or that the packager is directly connected with.* | |||
| OpenBLAS has a large number of build-time options which can be used to change | |||
| how it behaves at runtime, how artifacts or symbols are named, etc. Variation | |||
| in build configuration can be necessary to achieve a given end goal within a | |||
| distribution or as an end user. However, such variation can also make it more | |||
| difficult to build on top of OpenBLAS and ship code or other packages in a way | |||
| that works across many different distros. Here we provide guidance about the | |||
| most important build options, what effects they may have when changed, and | |||
| which ones to default to. | |||
| The Make and CMake build systems provide equivalent options and yield more or | |||
| less the same artifacts, but not exactly (the CMake builds are still | |||
| experimental). You can choose either one and the options will function in the | |||
| same way; however, the CMake outputs may require some renaming. To review | |||
| available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of | |||
| the repository. | |||
| Build options typically fall into two categories: (a) options that affect the | |||
| user interface, such as library and symbol names or APIs that are made | |||
| available, and (b) options that affect performance and runtime behavior, such | |||
| as threading behavior or CPU architecture-specific code paths. The user | |||
| interface options are more important to keep aligned between distributions, | |||
| while for the performance-related options there are typically more reasons to | |||
| make choices that deviate from the defaults. | |||
| Here are recommendations for user interface related packaging choices where it | |||
| is not likely to be a good idea to deviate (typically these are the default | |||
| settings): | |||
| 1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect | |||
| binary size much, so don't turn it off. | |||
| 2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and | |||
| while it does make up a significant part of the binary size of the installed | |||
| library, that does not outweigh the regression in usability when deviating | |||
| from the default here.[^1] | |||
| 3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency | |||
| detection files. These files are used by build systems when users want to | |||
| link against OpenBLAS, and there is no benefit to leaving them out. | |||
| 4. Provide the LP64 interface by default, and if in addition to that you choose | |||
| to provide an ILP64 interface build as well, use a symbol suffix to avoid | |||
| symbol name clashes (see the next section). | |||
| [^1]: All major distributions do include LAPACK as of mid 2023 as far as we | |||
| know. Older versions of Arch Linux did not, and that was known to cause | |||
| problems. | |||
| ## ILP64 interface builds | |||
| The LP64 (32-bit integer) interface is the default build, and has | |||
| well-established C and Fortran APIs as determined by the reference (Netlib) | |||
| BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does | |||
| not have a standard API: symbol names and shared/static library names can be | |||
| produced in multiple ways, and this tends to make it difficult to use. | |||
| As of today there is an agreed-upon way of choosing names for OpenBLAS between | |||
| a number of key users/redistributors, which is the closest thing to a standard | |||
| that there is now. However, there is an ongoing standardization effort in the | |||
| reference BLAS and LAPACK libraries, which differs from the current OpenBLAS | |||
| agreed-upon convention. In this section we'll aim to explain both. | |||
| Those two methods are fairly similar, and have a key thing in common: *using a | |||
| symbol suffix*. This is good practice; if you distribute an ILP64 build, it is | |||
| recommended to have it use a symbol suffix containing `64` in the name. | |||
| This avoids potential symbol clashes when different packages which depend on | |||
| OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. | |||
| ### The current OpenBLAS agreed-upon ILP64 convention | |||
| This convention comprises the shared library name and the symbol suffix in the | |||
| shared library. The symbol suffix to use is `64_`, implying that the library | |||
| name will be `libopenblas64_.so` and the symbols in that library end in `64_`. | |||
| The central issue where this was discussed is | |||
| [openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters | |||
| include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. | |||
| To build shared and static libraries with the currently recommended ILP64 | |||
| conventions with Make: | |||
| ```bash | |||
| $ make INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
| ``` | |||
| This will produce libraries named `libopenblas64_.so|a`, a pkg-config file | |||
| named `openblas64.pc`, and CMake and header files. | |||
| Installing locally and inspecting the output will show a few more details: | |||
| ```bash | |||
| $ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
| $ tree . # output slightly edited down | |||
| . | |||
| ├── include | |||
| │ ├── cblas.h | |||
| │ ├── f77blas.h | |||
| │ ├── lapacke_config.h | |||
| │ ├── lapacke.h | |||
| │ ├── lapacke_mangling.h | |||
| │ ├── lapacke_utils.h | |||
| │ ├── lapack.h | |||
| │ └── openblas_config.h | |||
| └── lib | |||
| ├── cmake | |||
| │ └── openblas | |||
| │ ├── OpenBLASConfig.cmake | |||
| │ └── OpenBLASConfigVersion.cmake | |||
| ├── libopenblas64_.a | |||
| ├── libopenblas64_.so | |||
| └── pkgconfig | |||
| └── openblas64.pc | |||
| ``` | |||
| A key point is the symbol names. These will equal the LP64 symbol names, then | |||
| (for Fortran only) the compiler mangling, and then the `64_` symbol suffix. | |||
| Hence to obtain the final symbol names, we need to take into account which | |||
| Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel | |||
| Fortran, or Flang), that means appending a single underscore. In that case, the | |||
| result is: | |||
| | base API name | binary symbol name | call from Fortran code | call from C code | | |||
| |---------------|--------------------|------------------------|-----------------------| | |||
| | `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| | `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | | |||
| It is quite useful to have these symbol names be as uniform as possible across | |||
| different packaging systems. | |||
| The equivalent build options with CMake are: | |||
| ```bash | |||
| $ mkdir build && cd build | |||
| $ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON | |||
| $ cmake --build . -j | |||
| ``` | |||
| Note that the result is not 100% identical to the Make result. For example, the | |||
| library name ends in `_64` rather than `64_` - it is recommended to rename them | |||
| to match the Make library names (also update the `libsuffix` entry in | |||
| `openblas64.pc` to match that rename). | |||
| ```bash | |||
| $ cmake --install . --prefix $PWD/../../openblas/cmake64 | |||
| $ tree . | |||
| . | |||
| ├── include | |||
| │ └── openblas64 | |||
| │ ├── cblas.h | |||
| │ ├── f77blas.h | |||
| │ ├── lapacke_config.h | |||
| │ ├── lapacke_example_aux.h | |||
| │ ├── lapacke.h | |||
| │ ├── lapacke_mangling.h | |||
| │ ├── lapacke_utils.h | |||
| │ ├── lapack.h | |||
| │ ├── openblas64 | |||
| │ │ └── lapacke_mangling.h | |||
| │ └── openblas_config.h | |||
| └── lib | |||
| ├── cmake | |||
| │ └── OpenBLAS64 | |||
| │ ├── OpenBLAS64Config.cmake | |||
| │ ├── OpenBLAS64ConfigVersion.cmake | |||
| │ ├── OpenBLAS64Targets.cmake | |||
| │ └── OpenBLAS64Targets-noconfig.cmake | |||
| ├── libopenblas_64.a | |||
| ├── libopenblas_64.so -> libopenblas_64.so.0 | |||
| └── pkgconfig | |||
| └── openblas64.pc | |||
| ``` | |||
| ### The upcoming standardized ILP64 convention | |||
| While the `64_` convention above got some adoption, it's slightly hacky and is | |||
| implemented through the use of `objcopy`. An effort is ongoing for a more | |||
| broadly adopted convention in the reference BLAS and LAPACK libraries, using | |||
| (a) the `_64` suffix, and (b) applying that suffix _before_ rather than after | |||
| Fortran compiler mangling. The central issue for this is | |||
| [lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). | |||
| For the most common cases of compiler mangling (a single `_` appended), the end | |||
| result will be: | |||
| | base API name | binary symbol name | call from Fortran code | call from C code | | |||
| |---------------|--------------------|------------------------|-----------------------| | |||
| | `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| | `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | | |||
| For other compiler mangling schemes, replace the trailing `_` by the scheme in use. | |||
| The shared library name for this `_64` convention should be `libopenblas_64.so`. | |||
| Note: it is not yet possible to produce an OpenBLAS build which employs this | |||
| convention! Once reference BLAS and LAPACK with support for `_64` have been | |||
| released, a future OpenBLAS release will support it. For now, please use the | |||
| older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be | |||
| considered reserved for future use of the `_64` standard as prescribed by | |||
| reference BLAS/LAPACK. | |||
| ## Performance and runtime behavior related build options | |||
| For these options there are multiple reasonable or common choices. | |||
| ### Threading related options | |||
| OpenBLAS can be built as a multi-threaded or single-threaded library, with the | |||
| default being multi-threaded. It's expected that the default `libopenblas` | |||
| library is multi-threaded; if you'd like to also distribute single-threaded | |||
| builds, consider naming them `libopenblas_sequential`. | |||
| OpenBLAS can be built with pthreads or OpenMP as the threading model, with the | |||
| default being pthreads. Both options are commonly used, and the choice here | |||
| should not influence the shared library name. The choice will be captured by | |||
| the `.pc` file, e.g.: | |||
| ```bash | |||
| $ pkg-config --libs openblas | |||
| -fopenmp -lopenblas | |||
| $ cat openblas.pc | |||
| ... | |||
| openblas_config= ... USE_OPENMP=1 MAX_THREADS=24 | |||
| ``` | |||
| The maximum number of threads users will be able to use is determined at build | |||
| time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide | |||
| range of values that are reasonable to use (up to 256). 64 is a typical choice | |||
| here; there is a memory footprint penalty that is linear in `NUM_THREADS`. | |||
| Please see `Makefile.rule` for more details. | |||
| ### CPU architecture related options | |||
| OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when | |||
| distributing to a user base with a variety of hardware, it is recommended to | |||
| enable CPU architecture runtime detection. This will dynamically select | |||
| optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` | |||
| build option. This is usually done on all common CPU families, except when | |||
| there are known issues. | |||
| In case the CPU architecture is known (e.g. you're building binaries for macOS | |||
| M1 users), it is possible to specify the target architecture directly with the | |||
| `TARGET=` build option. | |||
| `DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` | |||
| in this repository. | |||
| ## Real-world examples | |||
| OpenBLAS is likely to be distributed in one of these distribution models: | |||
| 1. As a standalone package, or multiple packages, in a packaging ecosystem like | |||
| a Linux distro, Homebrew, conda-forge or MSYS2. | |||
| 2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. | |||
| 3. Locally, e.g. making it available as a build on a single HPC cluster. | |||
| The guidance on this page is most important for models (1) and (2). These links | |||
| to build recipes for a representative selection of packaging systems may be | |||
| helpful as a reference: | |||
| - [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) | |||
| - [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) | |||
| - [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) | |||
| - [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) | |||
| - [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) | |||
| - [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) | |||
| - [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -44,10 +45,6 @@ | |||
| #define DIVIDE_RATE 2 | |||
| #endif | |||
| #ifndef SWITCH_RATIO | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| //The array of job_t may overflow the stack. | |||
| //Instead, use malloc to alloc job_t. | |||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||
| @@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| BLASLONG divN, divT; | |||
| int mode; | |||
| #if defined(DYNAMIC_ARCH) | |||
| int switch_ratio = gotoblas->switch_ratio; | |||
| #else | |||
| int switch_ratio = SWITCH_RATIO; | |||
| #endif | |||
| if (range_m) { | |||
| BLASLONG m_from = *(((BLASLONG *)range_m) + 0); | |||
| BLASLONG m_to = *(((BLASLONG *)range_m) + 1); | |||
| @@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| } | |||
| */ | |||
| if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { | |||
| if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { | |||
| GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| @@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| divT = nthreads; | |||
| divN = 1; | |||
| while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { | |||
| while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { | |||
| do { | |||
| divT --; | |||
| divN = 1; | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -44,10 +45,6 @@ | |||
| #define DIVIDE_RATE 2 | |||
| #endif | |||
| #ifndef SWITCH_RATIO | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| //The array of job_t may overflow the stack. | |||
| //Instead, use malloc to alloc job_t. | |||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||
| @@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| int mode, mask; | |||
| double dnum, di, dinum; | |||
| if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { | |||
| #if defined(DYNAMIC_ARCH) | |||
| int switch_ratio = gotoblas->switch_ratio; | |||
| #else | |||
| int switch_ratio = SWITCH_RATIO; | |||
| #endif | |||
| if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { | |||
| SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -44,10 +45,6 @@ | |||
| #define DIVIDE_RATE 2 | |||
| #endif | |||
| #ifndef SWITCH_RATIO | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| #ifndef GEMM_PREFERED_SIZE | |||
| #define GEMM_PREFERED_SIZE 1 | |||
| #endif | |||
| @@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| BLASLONG width, i, j, k, js; | |||
| BLASLONG m, n, n_from, n_to; | |||
| int mode; | |||
| #if defined(DYNAMIC_ARCH) | |||
| int switch_ratio = gotoblas->switch_ratio; | |||
| #else | |||
| int switch_ratio = SWITCH_RATIO; | |||
| #endif | |||
| /* Get execution mode */ | |||
| #ifndef COMPLEX | |||
| @@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| num_parts = 0; | |||
| while (n > 0){ | |||
| width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | |||
| if (width < SWITCH_RATIO) { | |||
| width = SWITCH_RATIO; | |||
| if (width < switch_ratio) { | |||
| width = switch_ratio; | |||
| } | |||
| width = round_up(n, width, GEMM_PREFERED_SIZE); | |||
| @@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||
| BLASLONG m = args -> m; | |||
| BLASLONG n = args -> n; | |||
| BLASLONG nthreads_m, nthreads_n; | |||
| #if defined(DYNAMIC_ARCH) | |||
| int switch_ratio = gotoblas->switch_ratio; | |||
| #else | |||
| int switch_ratio = SWITCH_RATIO; | |||
| #endif | |||
| /* Get dimensions from index ranges if available */ | |||
| if (range_m) { | |||
| @@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||
| n = range_n[1] - range_n[0]; | |||
| } | |||
| /* Partitions in m should have at least SWITCH_RATIO rows */ | |||
| if (m < 2 * SWITCH_RATIO) { | |||
| /* Partitions in m should have at least switch_ratio rows */ | |||
| if (m < 2 * switch_ratio) { | |||
| nthreads_m = 1; | |||
| } else { | |||
| nthreads_m = args -> nthreads; | |||
| while (m < nthreads_m * SWITCH_RATIO) { | |||
| while (m < nthreads_m * switch_ratio) { | |||
| nthreads_m = nthreads_m / 2; | |||
| } | |||
| } | |||
| /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ | |||
| if (n < SWITCH_RATIO * nthreads_m) { | |||
| /* Partitions in n should have at most switch_ratio * nthreads_m columns */ | |||
| if (n < switch_ratio * nthreads_m) { | |||
| nthreads_n = 1; | |||
| } else { | |||
| nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); | |||
| nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); | |||
| if (nthreads_m * nthreads_n > args -> nthreads) { | |||
| nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); | |||
| } | |||
| @@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { | |||
| increased_threads = 1; | |||
| for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| @@ -68,6 +68,7 @@ | |||
| #endif | |||
| int blas_server_avail = 0; | |||
| int blas_omp_number_max = 0; | |||
| extern int openblas_omp_adaptive_env(); | |||
| @@ -100,8 +101,6 @@ static void adjust_thread_buffers() { | |||
| void goto_set_num_threads(int num_threads) { | |||
| blas_num_threads_set = 1; | |||
| if (num_threads < 0) blas_num_threads_set = 0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| @@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) { | |||
| } | |||
| int blas_thread_init(void){ | |||
| if(blas_omp_number_max <= 0) | |||
| blas_omp_number_max = omp_get_max_threads(); | |||
| blas_get_cpu_number(); | |||
| @@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) | |||
| blas_server_avail = 1; | |||
| } | |||
| for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||
| #else | |||
| #define gotoblas_COOPERLAKE gotoblas_PRESCOTT | |||
| #endif | |||
| #ifdef DYN_SAPPHIRERAPIDS | |||
| extern gotoblas_t gotoblas_SAPPHIRERAPIDS; | |||
| #elif defined(DYN_SKYLAKEX) | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX | |||
| #elif defined(DYN_HASWELL) | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL | |||
| #elif defined(DYN_SANDYBRIDGE) | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE | |||
| #elif defined(DYN_NEHALEM) | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM | |||
| #else | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT | |||
| #endif | |||
| #else // not DYNAMIC_LIST | |||
| @@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN; | |||
| #ifndef NO_AVX512 | |||
| extern gotoblas_t gotoblas_SKYLAKEX; | |||
| extern gotoblas_t gotoblas_COOPERLAKE; | |||
| extern gotoblas_t gotoblas_SAPPHIRERAPIDS; | |||
| #else | |||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | |||
| #define gotoblas_COOPERLAKE gotoblas_HASWELL | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL | |||
| #endif | |||
| #endif | |||
| #else | |||
| @@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | |||
| #define gotoblas_COOPERLAKE gotoblas_NEHALEM | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | |||
| @@ -378,6 +394,31 @@ int support_avx512_bf16(){ | |||
| #endif | |||
| } | |||
| #define BIT_AMX_TILE 0x01000000 | |||
| #define BIT_AMX_BF16 0x00400000 | |||
| #define BIT_AMX_ENBD 0x00060000 | |||
| int support_amx_bf16() { /* Returns 1 when support_avx512() succeeds and CPUID reports AMX-TILE, AMX-BF16 and both BIT_AMX_ENBD bits; otherwise 0 (always 0 when built with NO_AVX or NO_AVX512). */ | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| if (!support_avx512()) | |||
| return 0; | |||
| // CPUID leaf 7, subleaf 0: EDX must have both BIT_AMX_TILE and BIT_AMX_BF16 set | |||
| cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { | |||
| // CPUID leaf 0xD, subleaf 0: EAX bits 17 and 18 (BIT_AMX_ENBD) must both be set for AMX to count as enabled | |||
| cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) | |||
| ret = 1; | |||
| } | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | |||
| @@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| if (model == 15){ // Sapphire Rapids | |||
| if(support_amx_bf16()) | |||
| return &gotoblas_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| @@ -941,7 +984,8 @@ static char *corename[] = { | |||
| "Excavator", | |||
| "Zen", | |||
| "SkylakeX", | |||
| "Cooperlake" | |||
| "Cooperlake", | |||
| "SapphireRapids" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_ZEN) return corename[23]; | |||
| if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; | |||
| if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; | |||
| if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; | |||
| return corename[0]; | |||
| } | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| #else | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_ARMV8SVE | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| @@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #ifndef NO_SVE | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 13 | |||
| #define NUM_CORETYPES 16 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #ifndef HWCAP_CPUID | |||
| #define HWCAP_CPUID (1 << 11) | |||
| #endif | |||
| #ifndef HWCAP_SVE | |||
| #define HWCAP_SVE (1 << 22) | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
| @@ -168,6 +181,7 @@ static char *corename[] = { | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "unknown" | |||
| }; | |||
| @@ -187,6 +201,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 12: return (&gotoblas_NEOVERSEN2); | |||
| case 13: return (&gotoblas_THUNDERX3T110); | |||
| case 14: return (&gotoblas_CORTEXA55); | |||
| case 15: return (&gotoblas_ARMV8SVE); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_NEOVERSEN1; | |||
| #ifndef NO_SVE | |||
| case 0xd49: | |||
| return &gotoblas_NEOVERSEN2; | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
| return &gotoblas_NEOVERSEN1; | |||
| } else | |||
| return &gotoblas_NEOVERSEN2; | |||
| case 0xd40: | |||
| return &gotoblas_NEOVERSEV1; | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
| return &gotoblas_NEOVERSEN1; | |||
| }else | |||
| return &gotoblas_NEOVERSEV1; | |||
| #endif | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| @@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) { | |||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #ifndef NO_SVE | |||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| return &gotoblas_ARMV8SVE; | |||
| } | |||
| #endif | |||
| return NULL; | |||
| #endif | |||
| } | |||
| @@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { /* Returns the current value of the blas_cpu_number variable. */ | |||
| return blas_cpu_number; | |||
| } | |||
| @@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { /* Returns the current value of the blas_cpu_number variable. */ | |||
| return blas_cpu_number; | |||
| } | |||
| @@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){ | |||
| #endif | |||
| if (memory_overflowed) goto terminate; | |||
| fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); | |||
| fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); | |||
| fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS); | |||
| memory_overflowed=1; | |||
| new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); | |||
| newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); | |||
| @@ -283,7 +283,6 @@ The numbers of threads in the thread pool. | |||
| This value is equal to or larger than blas_cpu_number. This means some threads are asleep. | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { | |||
| return blas_cpu_number; | |||
| @@ -21,7 +21,7 @@ blasobjsc=" | |||
| chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax | |||
| chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 | |||
| csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm | |||
| ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum" | |||
| ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" | |||
| blasobjsd=" | |||
| damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm | |||
| @@ -29,7 +29,7 @@ blasobjsd=" | |||
| dscal dsdot dspmv dspr2 dimatcopy domatcopy | |||
| dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv | |||
| dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv | |||
| idamax idamin idmax idmin dgeadd dsum" | |||
| idamax idamin idmax idmin dgeadd dsum dgemmt" | |||
| blasobjss=" | |||
| isamax isamin ismax ismin | |||
| @@ -38,7 +38,7 @@ blasobjss=" | |||
| smax smin snrm2 simatcopy somatcopy | |||
| srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap | |||
| ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv | |||
| strmm strmv strsm strsv sgeadd ssum" | |||
| strmm strmv strsm strsv sgeadd ssum sgemmt" | |||
| blasobjsz=" | |||
| izamax izamin | |||
| @@ -48,7 +48,7 @@ blasobjsz=" | |||
| zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv | |||
| ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv | |||
| zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 | |||
| zgeadd dzsum" | |||
| zgeadd dzsum zgemmt" | |||
| blasobjs="lsame xerbla" | |||
| bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" | |||
| @@ -58,7 +58,7 @@ cblasobjsc=" | |||
| cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby | |||
| cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd | |||
| cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv | |||
| cblas_scnrm2 cblas_scasum | |||
| cblas_scnrm2 cblas_scasum cblas_cgemmt | |||
| cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy | |||
| " | |||
| cblasobjsd=" | |||
| @@ -67,7 +67,7 @@ cblasobjsd=" | |||
| cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot | |||
| cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 | |||
| cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv | |||
| cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd | |||
| cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt | |||
| cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy | |||
| " | |||
| @@ -78,7 +78,7 @@ cblasobjss=" | |||
| cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr | |||
| cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk | |||
| cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm | |||
| cblas_strsv cblas_sgeadd | |||
| cblas_strsv cblas_sgeadd cblas_sgemmt | |||
| cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy | |||
| " | |||
| @@ -89,7 +89,7 @@ cblasobjsz=" | |||
| cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk | |||
| cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm | |||
| cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub | |||
| cblas_zaxpby cblas_zgeadd | |||
| cblas_zaxpby cblas_zgeadd cblas_zgemmt | |||
| cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy | |||
| " | |||
| @@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z | |||
| # functions added for lapack-3.7.0 | |||
| lapackobjs2s="$lapackobjs2s | |||
| slarfy | |||
| ssyconvf | |||
| strevc3 | |||
| sgelqt | |||
| sgelqt3 | |||
| @@ -843,6 +844,23 @@ lapackobjs2z="$lapackobjs2z | |||
| zungtsqr_row | |||
| " | |||
| #functions added for lapack-3.11 | |||
| lapackobjs2c="$lapackobjs2c | |||
| cgedmd | |||
| cgedmdq | |||
| " | |||
| lapackobjs2d="$lapackobjs2d | |||
| dgedmd | |||
| dgedmdq | |||
| " | |||
| lapackobjs2s="$lapackobjs2s | |||
| sgedmd | |||
| sgedmdq | |||
| " | |||
| lapackobjs2z="$lapackobjs2z | |||
| zgedmd | |||
| zgedmdq | |||
| " | |||
| lapack_extendedprecision_objs=" | |||
| zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx | |||
| dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx | |||
| @@ -1012,6 +1030,10 @@ lapackeobjsc=" | |||
| LAPACKE_cgebrd_work | |||
| LAPACKE_cgecon | |||
| LAPACKE_cgecon_work | |||
| LAPACKE_cgedmd | |||
| LAPACKE_cgedmd_work | |||
| LAPACKE_cgedmdq | |||
| LAPACKE_cgedmdq_work | |||
| LAPACKE_cgeequ | |||
| LAPACKE_cgeequ_work | |||
| LAPACKE_cgeequb | |||
| @@ -1671,6 +1693,10 @@ lapackeobjsd=" | |||
| LAPACKE_dgebrd_work | |||
| LAPACKE_dgecon | |||
| LAPACKE_dgecon_work | |||
| LAPACKE_dgedmd | |||
| LAPACKE_dgedmd_work | |||
| LAPACKE_dgedmdq | |||
| LAPACKE_dgedmdq_work | |||
| LAPACKE_dgeequ | |||
| LAPACKE_dgeequ_work | |||
| LAPACKE_dgeequb | |||
| @@ -2284,6 +2310,10 @@ lapackeobjss=" | |||
| LAPACKE_sgebrd_work | |||
| LAPACKE_sgecon | |||
| LAPACKE_sgecon_work | |||
| LAPACKE_sgedmd | |||
| LAPACKE_sgedmd_work | |||
| LAPACKE_sgedmdq | |||
| LAPACKE_sgedmdq_work | |||
| LAPACKE_sgeequ | |||
| LAPACKE_sgeequ_work | |||
| LAPACKE_sgeequb | |||
| @@ -2893,6 +2923,10 @@ lapackeobjsz=" | |||
| LAPACKE_zgebrd_work | |||
| LAPACKE_zgecon | |||
| LAPACKE_zgecon_work | |||
| LAPACKE_zgedmd | |||
| LAPACKE_zgedmd_work | |||
| LAPACKE_zgedmdq | |||
| LAPACKE_zgedmdq_work | |||
| LAPACKE_zgeequ | |||
| LAPACKE_zgeequ_work | |||
| LAPACKE_zgeequb | |||
| @@ -21,7 +21,7 @@ | |||
| chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | |||
| chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | |||
| csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); | |||
| @blasobjsd = ( | |||
| damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | |||
| @@ -29,7 +29,7 @@ | |||
| dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | |||
| dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | |||
| dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum); | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); | |||
| @blasobjss = ( | |||
| isamax,isamin,ismax,ismin, | |||
| @@ -38,7 +38,7 @@ | |||
| smax,smin,snrm2,simatcopy,somatcopy, | |||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum); | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); | |||
| @blasobjsz = ( | |||
| izamax,izamin,, | |||
| @@ -48,7 +48,7 @@ | |||
| zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
| zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | |||
| zgeadd, dzsum); | |||
| zgeadd, dzsum, zgemmt); | |||
| @blasobjs = (lsame, xerbla); | |||
| @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @@ -60,7 +60,7 @@ | |||
| cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | |||
| cblas_scnrm2, cblas_scasum, | |||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy | |||
| ); | |||
| cblas_cgemmt); | |||
| @cblasobjsd = ( | |||
| cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | |||
| cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | |||
| @@ -69,7 +69,7 @@ | |||
| cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | |||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, | |||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy | |||
| ); | |||
| cblas_dgemmt); | |||
| @cblasobjss = ( | |||
| cblas_sasum, cblas_saxpy, cblas_saxpby, | |||
| @@ -80,7 +80,7 @@ | |||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
| cblas_strsv, cblas_sgeadd, | |||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy | |||
| ); | |||
| cblas_sgemmt); | |||
| @cblasobjsz = ( | |||
| cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | |||
| cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | |||
| @@ -90,7 +90,7 @@ | |||
| cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | |||
| cblas_zaxpby, cblas_zgeadd, | |||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy | |||
| ); | |||
| cblas_zgemmt); | |||
| @cblasobjs = ( cblas_xerbla ); | |||
| @@ -101,7 +101,14 @@ else | |||
| *flang*) | |||
| vendor=FLANG | |||
| openmp='-fopenmp' | |||
| ;; | |||
| data=`$compiler -v 2>&1 > /dev/null ` | |||
| v="${data#*version *}" | |||
| v="${v%%*.}" | |||
| major="${v%%.*}" | |||
| if [ "$major" -ge 17 ]; then | |||
| vendor=FLANGNEW | |||
| fi | |||
| ;; | |||
| *ifort*|*ifx*) | |||
| vendor=INTEL | |||
| openmp='-fopenmp' | |||
| @@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n"); | |||
| #ifdef MAKE_NB_JOBS | |||
| #if MAKE_NB_JOBS > 0 | |||
| printf("MAKE += -j %d\n", MAKE_NB_JOBS); | |||
| printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS); | |||
| #else | |||
| // Let make use the parent's -j argument, or -j1 if there | |||
| // is no parent make | |||
| #endif | |||
| #elif NO_PARALLEL_MAKE==1 | |||
| printf("MAKE += -j 1\n"); | |||
| printf("MAKEFLAGS += -j 1\n"); | |||
| #else | |||
| printf("MAKE += -j %d\n", get_num_cores()); | |||
| printf("MAKEFLAGS += -j %d\n", get_num_cores()); | |||
| #endif | |||
| break; | |||
| @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
| info = 0; | |||
| if (lda < MAX(1, m)) info = 6; | |||
| if (lda < MAX(1, m)) info = 5; | |||
| if (ldc < MAX(1, m)) info = 8; | |||
| if (n < 0) info = 2; | |||
| @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { | |||
| #endif | |||
| #endif | |||
| #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||
| #define XFEATURE_XTILEDATA 18 | |||
| #define ARCH_REQ_XCOMP_PERM 0x1023 | |||
| static int openblas_amxtile_permission = 0; | |||
| /* | |||
| * Ask the kernel for permission to use the XTILEDATA xstate component by | |||
| * issuing arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA). | |||
| * | |||
| * Returns 0 and sets openblas_amxtile_permission to 1 when the request | |||
| * succeeds; returns -1 (after printing a diagnostic to stderr) when the | |||
| * syscall reports a non-zero status. The flag is left untouched on failure, | |||
| * so a later call issues the request again. | |||
| */ | |||
| static int init_amxtile_permission(void) { | |||
| long status = | |||
| syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); | |||
| if (status != 0) { | |||
| /* Permission was refused: report it and let the caller bail out. */ | |||
| fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, " | |||
| "Intel Sapphire Rapids), skip sbgemm calculation\n"); | |||
| return -1; | |||
| } | |||
| /* Remember the grant in the file-scope flag the callers test. */ | |||
| openblas_amxtile_permission = 1; | |||
| return 0; | |||
| } | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(char *TRANSA, char *TRANSB, | |||
| @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| #endif | |||
| #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (gotoblas->need_amxtile_permission && | |||
| openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { | |||
| return; | |||
| } | |||
| #endif | |||
| #if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) | |||
| if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { | |||
| return; | |||
| } | |||
| #endif | |||
| #endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||
| if ((args.m == 0) || (args.n == 0)) return; | |||
| #if 0 | |||
| @@ -35,29 +35,26 @@ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMT " | |||
| #define ERROR_NAME "QGEMMT " | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "DGEMT " | |||
| #define ERROR_NAME "DGEMMT " | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMT " | |||
| #define ERROR_NAME "SBGEMMT " | |||
| #else | |||
| #define ERROR_NAME "SGEMT " | |||
| #define ERROR_NAME "SGEMMT " | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMT " | |||
| #define ERROR_NAME "XGEMMT " | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "ZGEMT " | |||
| #define ERROR_NAME "ZGEMMT " | |||
| #else | |||
| #define ERROR_NAME "CGEMT " | |||
| #define ERROR_NAME "CGEMMT " | |||
| #endif | |||
| #endif | |||
| @@ -68,18 +65,19 @@ | |||
| #ifndef CBLAS | |||
| void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| blasint * M, blasint * N, blasint * K, | |||
| blasint * M, blasint * K, | |||
| FLOAT * Alpha, | |||
| IFLOAT * a, blasint * ldA, | |||
| IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) | |||
| { | |||
| blasint m, n, k; | |||
| blasint m, k; | |||
| blasint lda, ldb, ldc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| @@ -92,7 +90,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| PRINT_DEBUG_NAME; | |||
| m = *M; | |||
| n = *N; | |||
| k = *K; | |||
| #if defined(COMPLEX) | |||
| @@ -159,32 +156,39 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| info = 0; | |||
| if (uplo < 0) | |||
| info = 14; | |||
| if (ldc < m) | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowa)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowb)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 2; | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| if (info) { | |||
| if (info != 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #else | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, | |||
| blasint N, blasint k, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, | |||
| blasint k, | |||
| #ifndef COMPLEX | |||
| FLOAT alpha, | |||
| IFLOAT * A, blasint LDA, | |||
| @@ -205,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| blasint m, n, lda, ldb; | |||
| blasint lda, ldb; | |||
| FLOAT *a, *b; | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| uplo = -1; | |||
| transa = -1; | |||
| transb = -1; | |||
| info = 0; | |||
| if (order == CblasColMajor) { | |||
| if (Uplo == CblasUpper) uplo = 0; | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransA == CblasNoTrans) | |||
| transa = 0; | |||
| @@ -248,9 +255,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| transb = 3; | |||
| #endif | |||
| m = M; | |||
| n = N; | |||
| a = (void *)A; | |||
| b = (void *)B; | |||
| lda = LDA; | |||
| @@ -258,23 +262,31 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| if (ldc < m) | |||
| blasint nrowa, nrowb; | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 2; | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| if (order == CblasRowMajor) { | |||
| m = N; | |||
| n = M; | |||
| a = (void *)B; | |||
| b = (void *)A; | |||
| @@ -282,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| lda = LDB; | |||
| ldb = LDA; | |||
| if (Uplo == CblasUpper) uplo = 0; | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransB == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasTrans) | |||
| @@ -315,29 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| if (ldc < m) | |||
| blasint ncola, ncolb; | |||
| ncola = k; | |||
| if (transa) ncola = m; | |||
| ncolb = m; | |||
| if (transb) ncolb = k; | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 10; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 2; | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| uplo = -1; | |||
| if (Uplo == CblasUpper) | |||
| uplo = 0; | |||
| if (Uplo == CblasLower) | |||
| uplo = 1; | |||
| if (uplo < 0) | |||
| info = 14; | |||
| if (info >= 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| @@ -407,37 +423,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| if ((m == 0) || (n == 0)) | |||
| if (m == 0) | |||
| return; | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| const blasint incb = (transb == 0) ? 1 : ldb; | |||
| if (uplo == 1) { | |||
| for (i = 0; i < n; i++) { | |||
| j = n - i; | |||
| for (i = 0; i < m; i++) { | |||
| j = m - i; | |||
| l = j; | |||
| #if defined(COMPLEX) | |||
| aa = a + i * 2; | |||
| bb = b + i * ldb * 2; | |||
| if (transa) { | |||
| l = k; | |||
| aa = a + lda * i * 2; | |||
| bb = b + i * 2; | |||
| } | |||
| if (transb) | |||
| bb = b + i * 2; | |||
| cc = c + i * 2 * ldc + i * 2; | |||
| #else | |||
| aa = a + i; | |||
| bb = b + i * ldb; | |||
| if (transa) { | |||
| l = k; | |||
| aa = a + lda * i; | |||
| bb = b + i; | |||
| } | |||
| if (transb) | |||
| bb = b + i; | |||
| cc = c + i * ldc + i; | |||
| #endif | |||
| @@ -458,8 +472,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| @@ -479,20 +491,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| else | |||
| (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| else | |||
| (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| nthreads); | |||
| } | |||
| #endif | |||
| @@ -501,21 +527,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| for (i = 0; i < m; i++) { | |||
| j = i + 1; | |||
| l = j; | |||
| #if defined COMPLEX | |||
| bb = b + i * ldb * 2; | |||
| if (transa) { | |||
| l = k; | |||
| if (transb) { | |||
| bb = b + i * 2; | |||
| } | |||
| cc = c + i * 2 * ldc; | |||
| #else | |||
| bb = b + i * ldb; | |||
| if (transa) { | |||
| l = k; | |||
| if (transb) { | |||
| bb = b + i; | |||
| } | |||
| cc = c + i * ldc; | |||
| @@ -537,8 +561,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| @@ -558,30 +580,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| else | |||
| (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | |||
| incb, cc, 1, buffer); | |||
| else | |||
| (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, | |||
| incb, cc, 1, buffer); | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, cc, 1, | |||
| buffer, nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, a, lda, | |||
| bb, incb, cc, 1, | |||
| buffer, nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| } | |||
| FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, | |||
| args.m * args.k + args.k * args.n + | |||
| args.m * args.n, 2 * args.m * args.n * args.k); | |||
| IDEBUG_END; | |||
| @@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| if ( order == BlasColMajor) | |||
| { | |||
| if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; | |||
| if ( trans == BlasTrans && *ldb < *cols ) info = 9; | |||
| if ( trans == BlasNoTrans && *ldb < *rows ) info = 8; | |||
| if ( trans == BlasTrans && *ldb < *cols ) info = 8; | |||
| } | |||
| if ( order == BlasRowMajor) | |||
| { | |||
| if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; | |||
| if ( trans == BlasTrans && *ldb < *rows ) info = 9; | |||
| if ( trans == BlasNoTrans && *ldb < *cols ) info = 8; | |||
| if ( trans == BlasTrans && *ldb < *rows ) info = 8; | |||
| } | |||
| if ( order == BlasColMajor && *lda < *rows ) info = 7; | |||
| @@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #ifdef NEW_IMATCOPY | |||
| if ( *lda == *ldb && *rows == *cols) { | |||
| if ( *lda == *ldb ) { | |||
| if ( order == BlasColMajor ) | |||
| { | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); | |||
| return; | |||
| } | |||
| else | |||
| else if ( *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); | |||
| return; | |||
| } | |||
| } | |||
| else | |||
| @@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); | |||
| return; | |||
| } | |||
| else | |||
| else if ( *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); | |||
| return; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| if ( *lda > *ldb ) | |||
| msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); | |||
| else | |||
| msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| { | |||
| printf("Memory alloc failed\n"); | |||
| printf("Memory alloc failed in imatcopy\n"); | |||
| exit(1); | |||
| } | |||
| @@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| { | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); | |||
| OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb ); | |||
| } | |||
| else | |||
| { | |||
| OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); | |||
| OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb ); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); | |||
| OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb ); | |||
| } | |||
| else | |||
| { | |||
| OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); | |||
| OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb ); | |||
| } | |||
| } | |||
| @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| if (n <= 0) return 0.; | |||
| #ifndef COMPLEX | |||
| if (n == 1) | |||
| #ifdef DOUBLE | |||
| return fabs(x[0]); | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| #ifdef COMPLEX | |||
| x -= (n - 1) * incx * 2; | |||
| #else | |||
| x -= (n - 1) * incx; | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| if (n <= 0) return 0.; | |||
| #ifndef COMPLEX | |||
| if (n == 1) | |||
| #ifdef DOUBLE | |||
| return fabs(x[0]); | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| #ifdef COMPLEX | |||
| x -= (n - 1) * incx * 2; | |||
| #else | |||
| x -= (n - 1) * incx; | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -1,9 +1,11 @@ | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| @@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| #endif | |||
| #ifdef DOUBLE | |||
| long double safmin = DBL_MIN; | |||
| #else | |||
| long double safmin = FLT_MIN; | |||
| #endif | |||
| #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) | |||
| long double da = *DA; | |||
| long double db = *DB; | |||
| long double c; | |||
| long double s; | |||
| long double r, roe, z; | |||
| long double r, z; | |||
| long double sigma, dascal,dbscal; | |||
| long double ada = fabsl(da); | |||
| long double adb = fabsl(db); | |||
| long double scale = ada + adb; | |||
| long double maxab = MAX(ada,adb); | |||
| long double safmax; | |||
| long double scale; | |||
| #ifndef CBLAS | |||
| PRINT_DEBUG_NAME; | |||
| @@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| PRINT_DEBUG_CNAME; | |||
| #endif | |||
| roe = db; | |||
| if (ada > adb) roe = da; | |||
| if (scale == ZERO) { | |||
| if (adb == ZERO) { | |||
| *C = ONE; | |||
| *S = ZERO; | |||
| *DA = ZERO; | |||
| *DB = ZERO; | |||
| } else if (ada == ZERO) { | |||
| *C = ZERO; | |||
| *S = ONE; | |||
| *DA = *DB; | |||
| *DB = ONE; | |||
| } else { | |||
| r = sqrt(da * da + db * db); | |||
| if (roe < 0) r = -r; | |||
| safmax = 1./safmin; | |||
| scale = MIN(MAX(safmin,maxab), safmax); | |||
| if (ada > adb) | |||
| sigma = copysign(1.,da); | |||
| else | |||
| sigma = copysign(1.,db); | |||
| dascal = da / scale; | |||
| dbscal = db / scale; | |||
| r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal)); | |||
| c = da / r; | |||
| s = db / r; | |||
| z = ONE; | |||
| @@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| FLOAT db = *DB; | |||
| FLOAT c = *C; | |||
| FLOAT s = *S; | |||
| FLOAT r, roe, z; | |||
| FLOAT sigma; | |||
| FLOAT r, z; | |||
| FLOAT ada = fabs(da); | |||
| FLOAT adb = fabs(db); | |||
| FLOAT scale = ada + adb; | |||
| FLOAT maxab = MAX(ada,adb); | |||
| long double safmax ; | |||
| FLOAT scale ; | |||
| safmax = 1./safmin; | |||
| scale = MIN(MAX(safmin,maxab), safmax); | |||
| if (ada > adb) | |||
| sigma = copysign(1.,da); | |||
| else | |||
| sigma = copysign(1.,db); | |||
| #ifndef CBLAS | |||
| PRINT_DEBUG_NAME; | |||
| @@ -77,20 +108,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| PRINT_DEBUG_CNAME; | |||
| #endif | |||
| roe = db; | |||
| if (ada > adb) roe = da; | |||
| if (scale == ZERO) { | |||
| if (adb == ZERO) { | |||
| *C = ONE; | |||
| *S = ZERO; | |||
| *DA = ZERO; | |||
| *DB = ZERO; | |||
| } else if (ada == ZERO) { | |||
| *C = ZERO; | |||
| *S = ONE; | |||
| *DA = *DB; | |||
| *DB = ONE; | |||
| } else { | |||
| FLOAT aa = da / scale; | |||
| FLOAT bb = db / scale; | |||
| r = scale * sqrt(aa * aa + bb * bb); | |||
| if (roe < 0) r = -r; | |||
| r = sigma * scale * sqrt(aa * aa + bb * bb); | |||
| c = da / r; | |||
| s = db / r; | |||
| z = ONE; | |||
| @@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO, | |||
| int nodes; | |||
| #endif | |||
| # if defined(SMP) | |||
| int MN; | |||
| double MN; | |||
| #endif | |||
| blasint info; | |||
| int side; | |||
| @@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| int nodes; | |||
| #endif | |||
| #if defined(SMP) | |||
| int MN; | |||
| double MN; | |||
| #endif | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int NNK; | |||
| double NNK; | |||
| #ifdef USE_SIMPLE_THREADED_LEVEL3 | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| int NNK; | |||
| double NNK; | |||
| #ifdef USE_SIMPLE_THREADED_LEVEL3 | |||
| #ifndef COMPLEX | |||
| @@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #ifdef NEW_IMATCOPY | |||
| if (*lda == *ldb && *cols == *rows) { | |||
| if (*lda == *ldb ) { | |||
| if ( order == BlasColMajor ) | |||
| { | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasConj ) | |||
| { | |||
| IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasTrans ) | |||
| if ( trans == BlasTrans && *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasTransConj ) | |||
| if ( trans == BlasTransConj && *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| @@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasConj ) | |||
| { | |||
| IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasTrans ) | |||
| if ( trans == BlasTrans && *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| if ( trans == BlasTransConj ) | |||
| if ( trans == BlasTransConj && *rows == *cols ) | |||
| { | |||
| IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); | |||
| return; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| if ( *lda > *ldb ) | |||
| msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; | |||
| else | |||
| msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| { | |||
| printf("Memory alloc failed in zimatcopy\n"); | |||
| exit(1); | |||
| } | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| { | |||
| printf("Memory alloc failed in zimatcopy\n"); | |||
| exit(1); | |||
| } | |||
| if ( order == BlasColMajor ) | |||
| { | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); | |||
| } | |||
| if ( trans == BlasConj ) | |||
| else if ( trans == BlasConj ) | |||
| { | |||
| OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); | |||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); | |||
| } | |||
| if ( trans == BlasTrans ) | |||
| else if ( trans == BlasTrans ) | |||
| { | |||
| OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); | |||
| } | |||
| if ( trans == BlasTransConj ) | |||
| else if ( trans == BlasTransConj ) | |||
| { | |||
| OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); | |||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); | |||
| } | |||
| } | |||
| @@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| if ( trans == BlasNoTrans ) | |||
| { | |||
| OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); | |||
| } | |||
| if ( trans == BlasConj ) | |||
| else if ( trans == BlasConj ) | |||
| { | |||
| OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); | |||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); | |||
| } | |||
| if ( trans == BlasTrans ) | |||
| else if ( trans == BlasTrans ) | |||
| { | |||
| OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); | |||
| } | |||
| if ( trans == BlasTransConj ) | |||
| else if ( trans == BlasTransConj ) | |||
| { | |||
| OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||
| free(b); | |||
| return; | |||
| OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); | |||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); | |||
| } | |||
| } | |||
| free(b); | |||
| return; | |||
| @@ -1,9 +1,11 @@ | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| @@ -14,53 +16,28 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
| FLOAT *S = (FLOAT*) VS; | |||
| #endif /* CBLAS */ | |||
| #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) | |||
| long double da_r = *(DA + 0); | |||
| long double da_i = *(DA + 1); | |||
| long double db_r = *(DB + 0); | |||
| long double db_i = *(DB + 1); | |||
| long double r; | |||
| long double ada = fabsl(da_r) + fabsl(da_i); | |||
| PRINT_DEBUG_NAME; | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| if (ada == ZERO) { | |||
| *C = ZERO; | |||
| *(S + 0) = ONE; | |||
| *(S + 1) = ZERO; | |||
| *(DA + 0) = db_r; | |||
| *(DA + 1) = db_i; | |||
| } else { | |||
| long double alpha_r, alpha_i; | |||
| ada = sqrt(da_r * da_r + da_i * da_i); | |||
| r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); | |||
| #ifdef DOUBLE | |||
| long double safmin = DBL_MIN; | |||
| long double rtmin = sqrt(DBL_MIN/DBL_EPSILON); | |||
| #else | |||
| long double safmin = FLT_MIN; | |||
| long double rtmin = sqrt(FLT_MIN/FLT_EPSILON); | |||
| #endif | |||
| alpha_r = da_r / ada; | |||
| alpha_i = da_i / ada; | |||
| *(C + 0) = ada / r; | |||
| *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; | |||
| *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; | |||
| *(DA + 0) = alpha_r * r; | |||
| *(DA + 1) = alpha_i * r; | |||
| } | |||
| #else | |||
| FLOAT da_r = *(DA + 0); | |||
| FLOAT da_i = *(DA + 1); | |||
| FLOAT db_r = *(DB + 0); | |||
| FLOAT db_i = *(DB + 1); | |||
| FLOAT r; | |||
| FLOAT da_r = *(DA+0); | |||
| FLOAT da_i = *(DA+1); | |||
| FLOAT db_r = *(DB+0); | |||
| FLOAT db_i = *(DB+1); | |||
| //long double r; | |||
| FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT)); | |||
| FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT)); | |||
| long double d; | |||
| FLOAT ada = fabs(da_r) + fabs(da_i); | |||
| FLOAT adb; | |||
| FLOAT ada = da_r * da_r + da_i * da_i; | |||
| FLOAT adb = db_r * db_r + db_i * db_i; | |||
| FLOAT adart = sqrt( da_r * da_r + da_i * da_i); | |||
| FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); | |||
| PRINT_DEBUG_NAME; | |||
| @@ -68,69 +45,137 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
| FUNCTION_PROFILE_START(); | |||
| if (ada == ZERO) { | |||
| *C = ZERO; | |||
| *(S + 0) = ONE; | |||
| if (db_r == ZERO && db_i == ZERO) { | |||
| *C = ONE; | |||
| *(S + 0) = ZERO; | |||
| *(S + 1) = ZERO; | |||
| *(DA + 0) = db_r; | |||
| *(DA + 1) = db_i; | |||
| } else { | |||
| FLOAT scale; | |||
| FLOAT aa_r, aa_i, bb_r, bb_i; | |||
| FLOAT alpha_r, alpha_i; | |||
| aa_r = fabs(da_r); | |||
| aa_i = fabs(da_i); | |||
| if (aa_i > aa_r) { | |||
| aa_r = fabs(da_i); | |||
| aa_i = fabs(da_r); | |||
| } | |||
| if (aa_r == ZERO) { | |||
| ada = 0.; | |||
| } else { | |||
| scale = (aa_i / aa_r); | |||
| ada = aa_r * sqrt(ONE + scale * scale); | |||
| } | |||
| bb_r = fabs(db_r); | |||
| bb_i = fabs(db_i); | |||
| if (bb_i > bb_r) { | |||
| bb_r = fabs(bb_i); | |||
| bb_i = fabs(bb_r); | |||
| } | |||
| if (bb_r == ZERO) { | |||
| adb = 0.; | |||
| } else { | |||
| scale = (bb_i / bb_r); | |||
| adb = bb_r * sqrt(ONE + scale * scale); | |||
| } | |||
| scale = ada + adb; | |||
| aa_r = da_r / scale; | |||
| aa_i = da_i / scale; | |||
| bb_r = db_r / scale; | |||
| bb_i = db_i / scale; | |||
| r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); | |||
| alpha_r = da_r / ada; | |||
| alpha_i = da_i / ada; | |||
| *(C + 0) = ada / r; | |||
| *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; | |||
| *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; | |||
| *(DA + 0) = alpha_r * r; | |||
| *(DA + 1) = alpha_i * r; | |||
| return; | |||
| } | |||
| #endif | |||
| FUNCTION_PROFILE_END(4, 4, 4); | |||
| IDEBUG_END; | |||
| return; | |||
| long double safmax = 1./safmin; | |||
| #if defined DOUBLE | |||
| long double rtmax = safmax /DBL_EPSILON; | |||
| #else | |||
| long double rtmax = safmax /FLT_EPSILON; | |||
| #endif | |||
| *(S1 + 0) = *(DB + 0); | |||
| *(S1 + 1) = *(DB + 1) *-1; | |||
| if (da_r == ZERO && da_i == ZERO) { | |||
| *C = ZERO; | |||
| if (db_r == ZERO) { | |||
| (*DA) = fabsl(db_i); | |||
| *S = *S1 /da_r; | |||
| *(S+1) = *(S1+1) /da_r; | |||
| return; | |||
| } else if ( db_i == ZERO) { | |||
| *DA = fabsl(db_r); | |||
| *S = *S1 /da_r; | |||
| *(S+1) = *(S1+1) /da_r; | |||
| return; | |||
| } else { | |||
| long double g1 = MAX( fabsl(db_r), fabsl(db_i)); | |||
| rtmax =sqrt(safmax/2.); | |||
| if (g1 > rtmin && g1 < rtmax) { // unscaled | |||
| d = sqrt(adb); | |||
| *S = *S1 /d; | |||
| *(S+1) = *(S1+1) /d; | |||
| *DA = d ; | |||
| *(DA+1) = ZERO; | |||
| return; | |||
| } else { // scaled algorithm | |||
| long double u = MIN ( safmax, MAX ( safmin, g1)); | |||
| FLOAT gs_r = db_r/u; | |||
| FLOAT gs_i = db_i/u; | |||
| d = sqrt ( gs_r*gs_r + gs_i*gs_i); | |||
| *S = gs_r / d; | |||
| *(S + 1) = (gs_i * -1) / d; | |||
| *DA = d * u; | |||
| *(DA+1) = ZERO; | |||
| return; | |||
| } | |||
| } | |||
| } else { | |||
| FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i)); | |||
| FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i)); | |||
| rtmax = sqrt(safmax / 4.); | |||
| if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled | |||
| long double h = ada + adb; | |||
| double adahsq = sqrt(ada * h); | |||
| if (ada >= h *safmin) { | |||
| *C = sqrt(ada/h); | |||
| *R = *DA / *C; | |||
| *(R+1) = *(DA+1) / *(C+1); | |||
| rtmax *= 2.; | |||
| if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow | |||
| *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); | |||
| *(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq); | |||
| } else { | |||
| *S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h); | |||
| *(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h); | |||
| } | |||
| } else { | |||
| *C = ada / adahsq; | |||
| if (*C >= safmin) | |||
| *R = *DA / *C; | |||
| else | |||
| *R = *DA * (h / adahsq); | |||
| *S = *S1 * ada / adahsq; | |||
| *(S+1) = *(S1+1) * ada / adahsq; | |||
| } | |||
| *DA=*R; | |||
| *(DA+1)=*(R+1); | |||
| return; | |||
| } else { // scaled | |||
| FLOAT fs_r, fs_i, gs_r, gs_i; | |||
| long double v,w,f2,g2,h; | |||
| long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1))); | |||
| gs_r = db_r/u; | |||
| gs_i = db_i/u; | |||
| g2 = sqrt ( gs_r*gs_r + gs_i*gs_i); | |||
| if (f1 /u < rtmin) { | |||
| v = MIN (safmax, MAX (safmin, f1)); | |||
| w = v / u; | |||
| fs_r = *DA/ v; | |||
| fs_i = *(DA+1) / v; | |||
| f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); | |||
| h = f2 * w * w + g2; | |||
| } else { // use same scaling for both | |||
| w = 1.; | |||
| fs_r = *DA/ u; | |||
| fs_i = *(DA+1) / u; | |||
| f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); | |||
| h = f2 + g2; | |||
| } | |||
| if ( f2 >= h * safmin) { | |||
| *C = sqrt ( f2 / h ); | |||
| *DA = fs_r / *C; | |||
| *(DA+1) = fs_i / *C; | |||
| rtmax *= 2; | |||
| if ( f2 > rtmin && h < rtmax) { | |||
| *S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h)); | |||
| *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h)); | |||
| } else { | |||
| *S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h); | |||
| *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h); | |||
| } | |||
| } else { // intermediates might overflow | |||
| d = sqrt ( f2 * h); | |||
| *C = f2 /d; | |||
| if (*C >= safmin) { | |||
| *DA = fs_r / *C; | |||
| *(DA+1) = fs_i / *C; | |||
| } else { | |||
| *DA = fs_r * (h / d); | |||
| *(DA+1) = fs_i / (h / d); | |||
| } | |||
| *S = gs_r * (fs_r /d) - gs_i * (fs_i / d); | |||
| *(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d); | |||
| } | |||
| *C *= w; | |||
| *DA *= u; | |||
| *(DA+1) *= u; | |||
| return; | |||
| } | |||
| } | |||
| } | |||
| @@ -33,7 +33,7 @@ endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | |||
| override CFLAGS += -march=sapphirerapids | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| @@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| endif | |||
| else ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) | |||
| override CFLAGS += -march=cooperlake | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| @@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
| else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) | |||
| ifeq ($(C_COMPILER), PGI) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| @@ -35,6 +35,12 @@ USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(TARGET), MIPS64_GENERIC) | |||
| USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), HASWELL) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT absxi = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| if ( n == 1 ) return( ABS(x[0]) ); | |||
| n *= inc_x; | |||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG inc_x2; | |||
| FLOAT temp; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| inc_x2 = 2 * inc_x; | |||
| @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| @@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c | |||
| SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c | |||
| DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| @@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -1,189 +1 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR w17 | |||
| #define alphaI w18 | |||
| #define alphaI w19 | |||
| #define alpha0_R s10 | |||
| #define alphaV0_R v10.s[0] | |||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR w17 | |||
| #define alphaI w18 | |||
| #define alphaI w19 | |||
| #define alpha0_R s10 | |||
| #define alphaV0_R v10.s[0] | |||
| @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M1 | |||
| @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M2 | |||
| @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_E | |||
| @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z3.s, z15.s | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaz_R | |||
| fmls z28.s, p1/m, z21.s, alphaz_I | |||
| @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z31.s, p1/m, z23.s, alphaz_R | |||
| st2w {z30.s, z31.s}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, s0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, s1 | |||
| @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bne .Lcgemm_kernel_L4_Mv1_46 | |||
| .Lcgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Lcgemm_kernel_L4_Mv1_END: | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * lda * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR w17 | |||
| #define alphaI w18 | |||
| #define temp x19 | |||
| #define tempOffset x20 | |||
| #define tempK x21 | |||
| #define alphaI w19 | |||
| #define temp x20 | |||
| #define tempOffset x21 | |||
| #define tempK x22 | |||
| #define alpha0_R s10 | |||
| #define alphaV0_R v10.s[0] | |||
| @@ -1,79 +0,0 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| /* Packs a into b in panels of up to svcntd() consecutive j indices (0 <= j < n). */ | |||
| /* For each i in [0, m) a panel row gathers a[i + j * lda] over its active lanes and */ | |||
| /* stores them contiguously, so b advances by the active lane count per row. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| const uint64_t lanes_per_vec = svcntd(); | |||
| svint64_t gather_idx = svindex_s64(0LL, lda); /* lane k -> element offset k * lda */ | |||
| IFLOAT *panel_src = a; | |||
| IFLOAT *dst = b; | |||
| BLASLONG col = 0; | |||
| svbool_t lanes = svwhilelt_b64(col, n); | |||
| uint64_t lane_cnt = svcntp_b64(svptrue_b64(), lanes); | |||
| do { | |||
| IFLOAT *row_src = panel_src; | |||
| for (uint64_t rows_left = m; rows_left != 0; rows_left--) { | |||
| svfloat64_t packed = svld1_gather_index(lanes, (double *) row_src, gather_idx); | |||
| svst1_f64(lanes, (double *) dst, packed); | |||
| row_src += 1; | |||
| dst += lane_cnt; | |||
| } | |||
| /* Step to the next panel and rebuild the predicate for the j indices that remain. */ | |||
| panel_src += lanes_per_vec * lda; | |||
| col += svcntd(); | |||
| lanes = svwhilelt_b64(col, n); | |||
| lane_cnt = svcntp_b64(svptrue_b64(), lanes); | |||
| } while (svptest_any(svptrue_b64(), lanes)); | |||
| return 0; | |||
| } | |||
| @@ -1,77 +0,0 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| /* Packs a into b in panels of up to svcntd() consecutive j indices (0 <= j < n). */ | |||
| /* For each i in [0, m) a panel row loads the contiguous run starting at a[i * lda + j] */ | |||
| /* under the panel predicate and stores it contiguously, advancing b by the active */ | |||
| /* lane count. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| const uint64_t lanes_per_vec = svcntd(); | |||
| IFLOAT *panel_src = a; | |||
| IFLOAT *dst = b; | |||
| BLASLONG col = 0; | |||
| svbool_t lanes = svwhilelt_b64(col, n); | |||
| uint64_t lane_cnt = svcntp_b64(svptrue_b64(), lanes); | |||
| do { | |||
| IFLOAT *row_src = panel_src; | |||
| for (uint64_t rows_left = m; rows_left != 0; rows_left--) { | |||
| svfloat64_t packed = svld1(lanes, (double *)row_src); | |||
| svst1_f64(lanes, (double *) dst, packed); | |||
| row_src += lda; | |||
| dst += lane_cnt; | |||
| } | |||
| /* Step to the next panel and rebuild the predicate for the j indices that remain. */ | |||
| panel_src += lanes_per_vec; | |||
| col += svcntd(); | |||
| lanes = svwhilelt_b64(col, n); | |||
| lane_cnt = svcntp_b64(svptrue_b64(), lanes); | |||
| } while (svptest_any(svptrue_b64(), lanes)); | |||
| return 0; | |||
| } | |||
| @@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { | |||
| BLASLONG sve_width = SVE_WIDTH; | |||
| for (BLASLONG i = 0; i < n; i += sve_width * 2) { | |||
| svbool_t pg_a = SVE_WHILELT(i, n); | |||
| svbool_t pg_b = SVE_WHILELT(i + sve_width, n); | |||
| svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); | |||
| svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); | |||
| SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); | |||
| SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); | |||
| @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #include <arm_neon.h> | |||
| #if defined(SMP) | |||
| @@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | |||
| nrm2_compute(n, x, inc_x, &ssq, &scale); | |||
| #endif | |||
| if (fabs(scale) <1.e-300) return 0.; | |||
| volatile FLOAT sca = fabs(scale); | |||
| if (sca < DBL_MIN) return 0.; | |||
| ssq = sqrt(ssq) * scale; | |||
| return ssq; | |||
| @@ -0,0 +1,121 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| /* Lane-width dependent names: 64-bit lanes when DOUBLE is defined, 32-bit lanes otherwise. */ | |||
| #if defined(DOUBLE) | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_INDEX svuint64_t | |||
| #define SV_INDEXER svindex_u64 | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_INDEX svuint32_t | |||
| #define SV_INDEXER svindex_u32 | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| /* One packed row: gathers the values at src and src + 1 with stride lda_vec, stores them */ | |||
| /* interleaved with svst2, then advances src by 2 and dst by 2 * count. Uses the locals */ | |||
| /* a_vec_real, a_vec_imag and lda_vec of the enclosing function. */ | |||
| #define INNER_COPY(lanes, src, dst, lda, count) \ | |||
| a_vec_real = svld1_gather_index(lanes, src, lda_vec); \ | |||
| a_vec_imag = svld1_gather_index(lanes, src + 1, lda_vec); \ | |||
| svst2(lanes, dst, svcreate2(a_vec_real, a_vec_imag)); \ | |||
| src += 2; \ | |||
| dst += count * 2; | |||
| /* Packs a into b panel by panel: full panels of one vector of j indices first, then one */ | |||
| /* predicated panel for the n - (n & -vl) leftover indices. Each panel emits m packed rows. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| uint64_t vl; | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (vl) : : ); | |||
| SV_TYPE a_vec_real; | |||
| SV_TYPE a_vec_imag; | |||
| SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); /* lane k -> element offset k * (lda * 2) */ | |||
| svbool_t every_lane = SV_TRUE(); | |||
| IFLOAT *panel = a; | |||
| IFLOAT *dst = b; | |||
| IFLOAT *src; | |||
| /* NOTE(review): n & -vl rounds down to a multiple of vl only if vl is a power of two. */ | |||
| BLASLONG full_n = n & -vl; | |||
| BLASLONG j = 0; | |||
| while (j < full_n) { | |||
| src = panel; | |||
| /* Rows are emitted four at a time, then the m & 2 and m & 1 leftovers. */ | |||
| for (uint64_t quads = m >> 2; quads != 0; quads--) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| panel += vl * lda * 2; | |||
| j += vl; | |||
| } | |||
| BLASLONG tail_n = n - full_n; | |||
| if (tail_n) { | |||
| /* Final partial panel: only the first tail_n lanes are active. */ | |||
| svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n); | |||
| uint64_t tail_cnt = tail_n; | |||
| src = panel; | |||
| for (uint64_t quads = m >> 2; quads != 0; quads--) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| /* Lane-width dependent names: 64-bit lanes when DOUBLE is defined, 32-bit lanes otherwise. */ | |||
| #if defined(DOUBLE) | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_INDEX svuint64_t | |||
| #define SV_INDEXER svindex_u64 | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #define SV_PREFETCH svprfd_gather_index | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_INDEX svuint32_t | |||
| #define SV_INDEXER svindex_u32 | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #define SV_PREFETCH svprfw_gather_index | |||
| #endif | |||
| /* One packed row: gathers from src with stride lda_vec, stores contiguously to dst, then */ | |||
| /* advances src by 1 and dst by count. Uses the locals a_vec and lda_vec of the enclosing */ | |||
| /* function. */ | |||
| #define INNER_COPY(lanes, src, dst, lda, count) \ | |||
| a_vec = svld1_gather_index(lanes, src, lda_vec); \ | |||
| svst1(lanes, dst, a_vec); \ | |||
| src++; \ | |||
| dst += count; | |||
| /* Packs a into b panel by panel: full panels of one vector of j indices first, then one */ | |||
| /* predicated panel for the n - (n & -vl) leftover indices. Each panel emits m packed rows. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| uint64_t vl; | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (vl) : : ); | |||
| SV_TYPE a_vec; | |||
| SV_INDEX lda_vec = SV_INDEXER(0LL, lda); /* lane k -> element offset k * lda */ | |||
| svbool_t every_lane = SV_TRUE(); | |||
| IFLOAT *panel = a; | |||
| IFLOAT *dst = b; | |||
| IFLOAT *src; | |||
| /* NOTE(review): n & -vl rounds down to a multiple of vl only if vl is a power of two. */ | |||
| BLASLONG full_n = n & -vl; | |||
| BLASLONG j = 0; | |||
| while (j < full_n) { | |||
| src = panel; | |||
| /* Rows are emitted eight at a time, then the m & 4, m & 2 and m & 1 leftovers. */ | |||
| for (uint64_t octets = m >> 3; octets != 0; octets--) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| if (m & 4) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(every_lane, src, dst, lda, vl); | |||
| } | |||
| panel += vl * lda; | |||
| j += vl; | |||
| } | |||
| BLASLONG tail_n = n - full_n; | |||
| if (tail_n) { | |||
| /* Final partial panel: only the first tail_n lanes are active; rows go four at a time. */ | |||
| svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n); | |||
| uint64_t tail_cnt = tail_n; | |||
| src = panel; | |||
| for (uint64_t quads = m >> 2; quads != 0; quads--) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(tail_lanes, src, dst, lda, tail_cnt); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64x2_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32x2_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
| a_vec = svld2(pg, a_offset_inner); \ | |||
| svst2(pg, b_offset, a_vec); \ | |||
| a_offset_inner += lda * 2; \ | |||
| b_offset += active * 2; | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| uint64_t sve_size = svcntw(); | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_TYPE a_vec; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size * 2; | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,125 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
| a_vec = svld1(pg, a_offset_inner); \ | |||
| svst1(pg, b_offset, a_vec); \ | |||
| a_offset_inner += lda; \ | |||
| b_offset += active; | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| uint64_t sve_size = svcntw(); | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_TYPE a_vec; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 3; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 4) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size; | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,78 +0,0 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| /* Packs a into b in panels of up to svcntw() consecutive j indices (0 <= j < n). */ | |||
| /* For each i in [0, m) a panel row gathers a[i + j * lda] over its active lanes and */ | |||
| /* stores them contiguously, so b advances by the active lane count per row. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| const uint32_t lanes_per_vec = svcntw(); | |||
| svint32_t gather_idx = svindex_s32(0LL, lda); /* lane k -> element offset k * lda, as 32-bit indices */ | |||
| IFLOAT *panel_src = a; | |||
| IFLOAT *dst = b; | |||
| BLASLONG col = 0; | |||
| svbool_t lanes = svwhilelt_b32(col, n); | |||
| uint32_t lane_cnt = svcntp_b32(svptrue_b32(), lanes); | |||
| do { | |||
| IFLOAT *row_src = panel_src; | |||
| for (uint32_t rows_left = m; rows_left != 0; rows_left--) { | |||
| svfloat32_t packed = svld1_gather_index(lanes, (float *) row_src, gather_idx); | |||
| svst1_f32(lanes, (float *) dst, packed); | |||
| row_src += 1; | |||
| dst += lane_cnt; | |||
| } | |||
| /* Step to the next panel and rebuild the predicate for the j indices that remain. */ | |||
| panel_src += lanes_per_vec * lda; | |||
| col += svcntw(); | |||
| lanes = svwhilelt_b32(col, n); | |||
| lane_cnt = svcntp_b32(svptrue_b32(), lanes); | |||
| } while (svptest_any(svptrue_b32(), lanes)); | |||
| return 0; | |||
| } | |||
| @@ -1,77 +0,0 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| uint32_t sve_size = svcntw(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec = svld1(pg, (float *) aoffset1); | |||
| svst1_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||