Merge develop branch for 0.3.27 (tags/v0.3.27)
| @@ -1,44 +1,44 @@ | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - make TARGET=VORTEX USE_OPENMP=1 CC=clang | |||
| #task: | |||
| # name: AppleM1/LLVM | |||
| # compile_script: | |||
| # - brew install llvm | |||
| # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| # - make TARGET=VORTEX USE_OPENMP=1 CC=clang | |||
| task: | |||
| name: AppleM1/LLVM/ILP64 | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 | |||
| #task: | |||
| # name: AppleM1/LLVM/ILP64 | |||
| # compile_script: | |||
| # - brew install llvm | |||
| # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| # - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 | |||
| task: | |||
| name: AppleM1/LLVM/CMAKE | |||
| compile_script: | |||
| - brew install llvm | |||
| - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - mkdir build | |||
| - cd build | |||
| - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
| - make -j 4 | |||
| #task: | |||
| # name: AppleM1/LLVM/CMAKE | |||
| # compile_script: | |||
| # - brew install llvm | |||
| # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| # - mkdir build | |||
| # - cd build | |||
| # - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
| # - make -j 4 | |||
| task: | |||
| name: AppleM1/GCC/MAKE/OPENMP | |||
| compile_script: | |||
| - brew install gcc@11 | |||
| - export PATH=/opt/homebrew/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/include" | |||
| - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
| #task: | |||
| # name: AppleM1/GCC/MAKE/OPENMP | |||
| # compile_script: | |||
| # - brew install gcc@11 | |||
| # - export PATH=/opt/homebrew/bin:$PATH | |||
| # - export LDFLAGS="-L/opt/homebrew/lib" | |||
| # - export CPPFLAGS="-I/opt/homebrew/include" | |||
| # - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| @@ -58,8 +58,8 @@ task: | |||
| - export VALID_ARCHS="i386 x86_64" | |||
| - xcrun --sdk macosx --show-sdk-path | |||
| - xcodebuild -version | |||
| - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64" | |||
| - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64" | |||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| @@ -78,8 +78,8 @@ task: | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | |||
| always: | |||
| config_artifacts: | |||
| @@ -91,14 +91,16 @@ macos_instance: | |||
| task: | |||
| name: AppleM1/LLVM armv7-androidndk xbuild | |||
| compile_script: | |||
| - #brew install android-ndk | |||
| - brew install android-ndk | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib" | |||
| - ls /System/Volumes/Data/opt/homebrew | |||
| - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/ | |||
| - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" | |||
| - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
| - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
| - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| @@ -0,0 +1,149 @@ | |||
| name: apple m | |||
| on: [push, pull_request] | |||
| concurrency: | |||
| group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | |||
| cancel-in-progress: true | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| build: | |||
| if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
| runs-on: macos-14 | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| build: [cmake, make] | |||
| fortran: [gfortran] | |||
| openmp: [0, 1] | |||
| ilp64: [0, 1] | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: Print system information | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| cat /proc/cpuinfo | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| sysctl -a | grep machdep.cpu | |||
| else | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| - name: Install Dependencies | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
| brew reinstall gcc | |||
| brew install coreutils cmake ccache | |||
| brew install llvm | |||
| else | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| # GNU make and cmake call the compilers differently. It looks like | |||
| # that causes the cache to mismatch. Keep the ccache for both build | |||
| # tools separate to avoid polluting each other. | |||
| key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} | |||
| # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} | |||
| ccache-${{ runner.os }}-${{ matrix.build }} | |||
| # Step: for make builds, extend PATH via GITHUB_PATH; for all builds, write ~/.ccache/ccache.conf. | |||
| - name: Configure ccache | |||
| run: | | |||
| # Only the make build gets PATH changes here; the cmake branch of the build step passes ccache via launcher options instead. | |||
| if [ "${{ matrix.build }}" = "make" ]; then | |||
| # Add ccache to path | |||
| if [ "$RUNNER_OS" = "Linux" ]; then | |||
| echo "/usr/lib/ccache" >> $GITHUB_PATH | |||
| elif [ "$RUNNER_OS" = "macOS" ]; then | |||
| # presumably ccache's compiler-wrapper directory, followed by the Homebrew LLVM binaries - TODO confirm | |||
| echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH | |||
| echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH | |||
| # NOTE(review): this appends an empty line to GITHUB_PATH; purpose unclear - confirm it is intentional. | |||
| echo "" >>$GITHUB_PATH | |||
| else | |||
| echo "::error::$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| fi | |||
| # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| # The first write uses ">" and so replaces any existing ccache.conf; the second line appends to it. | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| # Show the ccache statistics in the job log before the build starts. | |||
| ccache -s | |||
| # Step: build OpenBLAS with the Homebrew LLVM clang, using GNU make or CMake as selected by matrix.build. | |||
| - name: Build OpenBLAS | |||
| run: | | |||
| # Point the C compiler and the preprocessor/linker search paths at the Homebrew LLVM installation. | |||
| export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| export CC="/opt/homebrew/opt/llvm/bin/clang" | |||
| case "${{ matrix.build }}" in | |||
| "make") | |||
| # OpenMP and ILP64 come from the job matrix; FC is prefixed with ccache so Fortran compiles are cached. | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" | |||
| ;; | |||
| "cmake") | |||
| # NOTE(review): -ld_classic presumably selects Apple's legacy linker to work around an issue with the newer one - confirm. | |||
| export LDFLAGS="$LDFLAGS -Wl,-ld_classic" | |||
| mkdir build && cd build | |||
| # Out-of-tree configure; ccache is attached through the C and Fortran compiler-launcher options. | |||
| cmake -DDYNAMIC_ARCH=1 \ | |||
| -DUSE_OPENMP=${{matrix.openmp}} \ | |||
| -DINTERFACE64=${{matrix.ilp64}} \ | |||
| -DNOFORTRAN=0 \ | |||
| -DBUILD_WITHOUT_LAPACK=0 \ | |||
| -DCMAKE_VERBOSE_MAKEFILE=ON \ | |||
| -DCMAKE_BUILD_TYPE=Release \ | |||
| -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ | |||
| -DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
| -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
| .. | |||
| cmake --build . | |||
| ;; | |||
| *) | |||
| # Any other matrix.build value is reported as an error and fails the step. | |||
| echo "::error::Configuration not supported" | |||
| exit 1 | |||
| ;; | |||
| esac | |||
| - name: Show ccache status | |||
| continue-on-error: true | |||
| run: ccache -s | |||
| # Step: run the test suites for the build system selected by matrix.build (limited to 60 minutes). | |||
| - name: Run tests | |||
| timeout-minutes: 60 | |||
| run: | | |||
| case "${{ matrix.build }}" in | |||
| "make") | |||
| # NOTE(review): MAKE_FLAGS hard-codes USE_OPENMP=0 and does not pass INTERFACE64, while the build step | |||
| # uses matrix.openmp and matrix.ilp64 - confirm that this difference is intended. | |||
| MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' | |||
| # $MAKE_FLAGS is expanded unquoted below, so it word-splits into two separate make arguments. | |||
| # The ::group:: / ::endgroup:: lines wrap each suite's output in its own section of the log. | |||
| echo "::group::Tests in 'test' directory" | |||
| make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| echo "::group::Tests in 'ctest' directory" | |||
| make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| echo "::group::Tests in 'utest' directory" | |||
| make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" | |||
| echo "::endgroup::" | |||
| ;; | |||
| "cmake") | |||
| # CMake builds run their tests with ctest from the build directory created by the build step. | |||
| cd build && ctest | |||
| ;; | |||
| *) | |||
| # Any other matrix.build value is reported as an error and fails the step. | |||
| echo "::error::Configuration not supported" | |||
| exit 1 | |||
| ;; | |||
| esac | |||
| @@ -14,8 +14,8 @@ jobs: | |||
| if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
| runs-on: ubuntu-latest | |||
| env: | |||
| xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 | |||
| toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz | |||
| xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 | |||
| toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| @@ -76,7 +76,7 @@ jobs: | |||
| run: | | |||
| wget ${xuetie_toolchain}/${toolchain_file_name} | |||
| tar -xvf ${toolchain_file_name} -C /opt | |||
| export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" | |||
| export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" | |||
| make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
| @@ -42,6 +42,7 @@ jobs: | |||
| - name: Install Dependencies | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| sudo apt-get update | |||
| sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
| @@ -0,0 +1,253 @@ | |||
| name: riscv64 zvl256b qemu test | |||
| on: [push, pull_request] | |||
| concurrency: | |||
| group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | |||
| cancel-in-progress: true | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| TEST: | |||
| if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
| runs-on: ubuntu-latest | |||
| env: | |||
| triple: riscv64-unknown-linux-gnu | |||
| riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain | |||
| riscv_gnu_toolchain_version: 13.2.0 | |||
| riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| include: | |||
| - target: RISCV64_ZVL128B | |||
| opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 | |||
| - target: RISCV64_ZVL256B | |||
| opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: install build deps | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make \ | |||
| libgomp1-riscv64-cross ccache | |||
| wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} | |||
| tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.target }} | |||
| - name: Configure ccache | |||
| run: | | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| # Step: cross-build the OpenBLAS libraries for the matrix target, with clang as the C compiler and | |||
| # the ${triple}-prefixed binutils/gfortran; /opt/riscv is presumably where the toolchain tarball unpacks - TODO confirm. | |||
| - name: build OpenBLAS libs | |||
| run: | | |||
| export PATH="/opt/riscv/bin:$PATH" | |||
| # ${triple} and ${riscv_gnu_toolchain_version} are inside single quotes, so the shell passes them to make | |||
| # literally; presumably make then expands them from the job-level env values - confirm. | |||
| # Target tools are run through ccache; HOSTCC/HOSTFC name the native gcc/gfortran. | |||
| make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
| CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ | |||
| AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
| RANLIB='ccache ${triple}-ranlib' \ | |||
| FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
| HOSTCC=gcc HOSTFC=gfortran -j$(nproc) | |||
| - name: build OpenBLAS tests | |||
| run: | | |||
| export PATH="/opt/riscv/bin:$PATH" | |||
| make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
| CC='${triple}-gcc' \ | |||
| AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
| RANLIB='ccache ${triple}-ranlib' \ | |||
| FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
| HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests | |||
| - name: build lapack-netlib tests | |||
| working-directory: ./lapack-netlib/TESTING | |||
| run: | | |||
| export PATH="/opt/riscv/bin:$PATH" | |||
| make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ | |||
| CC='${triple}-gcc' \ | |||
| AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ | |||
| RANLIB='ccache ${triple}-ranlib' \ | |||
| FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ | |||
| HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ | |||
| LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ | |||
| LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ | |||
| LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ | |||
| - name: OpenBLAS tests | |||
| shell: bash | |||
| run: | | |||
| export PATH="/opt/riscv/bin:$PATH" | |||
| export QEMU_CPU=${{ matrix.qemu_cpu }} | |||
| rm -rf ./test_out | |||
| mkdir -p ./test_out | |||
| # Run one test binary under qemu-riscv64 and record its output in ./test_out/<dir>.<cmd>. | |||
| #   $1 = directory holding the binary, $2 = binary name, | |||
| #   $3 = optional data file (relative to $1) that is fed to the binary on stdin. | |||
| # A "*** FAIL" marker is appended to the log when the binary exits nonzero, so the | |||
| # caller can detect failures by searching the logs after all background jobs finish. | |||
| run_test() { | |||
| local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; local RV=0 | |||
| # Start the log with a header line; "tee -a" below appends instead of truncating it away. | |||
| echo "`pwd`/$DIR/$CMD $DIR/$DATA" > "$OUTPUT" | |||
| # PIPESTATUS[0] is qemu's own exit status (plain $? describes the pipeline, not qemu). | |||
| # Capturing it through "||" also stops errexit from aborting this job before the FAIL | |||
| # marker is written. The "||" branch relies on pipefail, which GitHub enables for "shell: bash". | |||
| if [[ -z $DATA ]]; then | |||
| qemu-riscv64 "./$DIR/$CMD" |& tee -a "$OUTPUT" || RV=${PIPESTATUS[0]} | |||
| else | |||
| qemu-riscv64 "./$DIR/$CMD" < "./$DIR/$DATA" |& tee -a "$OUTPUT" || RV=${PIPESTATUS[0]} | |||
| fi | |||
| if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT" ; fi | |||
| } | |||
| run_test test cblat1 & | |||
| run_test test cblat2 cblat2.dat & | |||
| run_test test cblat3 cblat3.dat & | |||
| run_test test dblat1 & | |||
| run_test test dblat2 dblat2.dat & | |||
| run_test test dblat3 dblat3.dat & | |||
| run_test test sblat1 & | |||
| run_test test sblat2 sblat2.dat & | |||
| run_test test sblat3 sblat3.dat & | |||
| run_test test zblat1 & | |||
| run_test test zblat2 zblat2.dat & | |||
| run_test test zblat3 zblat3.dat & | |||
| run_test ctest xccblat1 & | |||
| run_test ctest xccblat2 cin2 & | |||
| run_test ctest xccblat3 cin3 & | |||
| run_test ctest xdcblat1 & | |||
| run_test ctest xdcblat2 din2 & | |||
| run_test ctest xdcblat3 din3 & | |||
| run_test ctest xscblat1 & | |||
| run_test ctest xscblat2 sin2 & | |||
| run_test ctest xscblat3 sin3 & | |||
| run_test ctest xzcblat1 & | |||
| run_test ctest xzcblat2 zin2 & | |||
| run_test ctest xzcblat3 zin3 & | |||
| wait | |||
| while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) | |||
| if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi | |||
| - name: netlib tests | |||
| shell: bash | |||
| run: | | |||
| : # these take a very long time | |||
| echo "Skipping netlib tests in CI" | |||
| exit 0 | |||
| : # comment out exit above to enable the tests | |||
| : # probably we want to identify a subset to run in CI | |||
| export PATH="/opt/riscv/bin:$PATH" | |||
| export QEMU_CPU=${{ matrix.qemu_cpu }} | |||
| rm -rf ./test_out | |||
| mkdir -p ./test_out | |||
| # Run one LAPACK test program under qemu-riscv64 and log its output. | |||
| #   $1 = log file name under ./test_out | |||
| #   $2 = input file and $3 = test binary, both relative to ./lapack-netlib/TESTING | |||
| #   $4 = description written as the first line of the log | |||
| # "*** FAIL" markers are appended for a nonzero exit status and for 'fail' / 'error' text in the log. | |||
| run_test() { | |||
| local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; local RV=0 | |||
| # Header lines; "tee -a" below appends so they are no longer overwritten. | |||
| echo "$4" > "$OUTPUT" | |||
| echo "$CMD" >> "$OUTPUT" | |||
| # PIPESTATUS[0] is qemu's own exit status; capturing it through "||" keeps errexit from | |||
| # aborting before the markers are written (relies on pipefail, enabled for "shell: bash"). | |||
| qemu-riscv64 "$CMD" < "$DATA" |& tee -a "$OUTPUT" || RV=${PIPESTATUS[0]} | |||
| if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT" ; fi | |||
| if grep -q fail "$OUTPUT" ; then echo "*** FAIL: log contains 'fail'" >> "$OUTPUT" ; fi | |||
| # Flag lines containing "rror" unless they also contain "passed" or "largest error". | |||
| # The first grep must print its matches: with -q it fed nothing down the pipe and the check never fired. | |||
| if grep rror "$OUTPUT" | grep -v -e "passed" -e "largest error" > /dev/null ; then echo "*** FAIL: log contains 'error'" >> "$OUTPUT" ; fi | |||
| } | |||
| run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & | |||
| run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & | |||
| run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & | |||
| run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & | |||
| run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & | |||
| run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & | |||
| run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & | |||
| run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & | |||
| run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" & | |||
| run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & | |||
| run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
| run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & | |||
| run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & | |||
| run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & | |||
| run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
| run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
| run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & | |||
| run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & | |||
| run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & | |||
| run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & | |||
| run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & | |||
| run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & | |||
| run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & | |||
| run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & | |||
| run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & | |||
| run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
| run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & | |||
| run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & | |||
| run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
| run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & | |||
| run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & | |||
| run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & | |||
| run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
| run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
| run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & | |||
| run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & | |||
| run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & | |||
| run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & | |||
| run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & | |||
| run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & | |||
| run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & | |||
| run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & | |||
| run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & | |||
| run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
| run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & | |||
| run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & | |||
| run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
| run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & | |||
| run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & | |||
| run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & | |||
| run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
| run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
| run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & | |||
| run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & | |||
| run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & | |||
| run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & | |||
| run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & | |||
| run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & | |||
| run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & | |||
| run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & | |||
| run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & | |||
| run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
| run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & | |||
| run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & | |||
| run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & | |||
| run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & | |||
| run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & | |||
| run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & | |||
| run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & | |||
| run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & | |||
| run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & | |||
| run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & | |||
| run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & | |||
| run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & | |||
| run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & | |||
| run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & | |||
| run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & | |||
| run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & | |||
| run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & | |||
| run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & | |||
| run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & | |||
| run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & | |||
| run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & | |||
| wait | |||
| # Print every log that contains a FAIL marker and remember that at least one test failed. | |||
| while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) | |||
| # Summarize all logs with the netlib helper script; the summary is written to netlib_summary. | |||
| python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary | |||
| # NOTE(review): TOTALS is not referenced again in this step. | |||
| TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" | |||
| # Default to -1 so that a summary without an "ALL PRECISIONS" line is treated as a failure below. | |||
| NUMERICAL_ERRORS=-1 | |||
| OTHER_ERRORS=-1 | |||
| # Source the two counters from fields 5 and 7 of the "ALL PRECISIONS" line. | |||
| # The process substitution needs its closing parenthesis; without it bash rejects the script. | |||
| . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary) | |||
| if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi | |||
| if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi | |||
| @@ -47,46 +47,59 @@ config_last.h | |||
| getarch | |||
| getarch_2nd | |||
| utest/openblas_utest | |||
| utest/openblas_utest_ext | |||
| ctest/xccblat1 | |||
| ctest/xccblat2 | |||
| ctest/xccblat3 | |||
| ctest/xccblat3_3m | |||
| ctest/xdcblat1 | |||
| ctest/xdcblat2 | |||
| ctest/xdcblat3 | |||
| ctest/xdcblat3_3m | |||
| ctest/xscblat1 | |||
| ctest/xscblat2 | |||
| ctest/xscblat3 | |||
| ctest/xscblat3_3m | |||
| ctest/xzcblat1 | |||
| ctest/xzcblat2 | |||
| ctest/xzcblat3 | |||
| ctest/xzcblat3_3m | |||
| exports/linktest.c | |||
| exports/linux.def | |||
| kernel/setparam_*.c | |||
| kernel/kernel_*.h | |||
| test/CBLAT2.SUMM | |||
| test/CBLAT3.SUMM | |||
| test/CBLAT3_3M.SUMM | |||
| test/DBLAT2.SUMM | |||
| test/DBLAT3.SUMM | |||
| test/DBLAT3_3M.SUMM | |||
| test/SBLAT2.SUMM | |||
| test/SBLAT3.SUMM | |||
| test/SBLAT3_3M.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/ZBLAT3_3M.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/SBBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| test/cblat3_3m | |||
| test/dblat1 | |||
| test/dblat2 | |||
| test/dblat3 | |||
| test/dblat3_3m | |||
| test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/sblat3_3m | |||
| test/test_shgemm | |||
| test/test_sbgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| test/zblat3_3m | |||
| build | |||
| build.* | |||
| *.swp | |||
| @@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d | |||
| option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | |||
| option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) | |||
| option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| @@ -40,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni | |||
| option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON) | |||
| option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF) | |||
| set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" ) | |||
| set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" ) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| @@ -96,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
| @@ -323,7 +330,7 @@ if (NOT NOFORTRAN) | |||
| # Build test and ctest | |||
| add_subdirectory(test) | |||
| endif() | |||
| if (BUILD_TESTING) | |||
| if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK) | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| endif() | |||
| endif() | |||
| @@ -336,11 +343,12 @@ endif() | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| if (NOT FIXED_LIBNAME) | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") | |||
| @@ -452,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| endif() | |||
| endif() | |||
| if (BUILD_BENCHMARKS) | |||
| #find_package(OpenMP REQUIRED) | |||
| file(GLOB SOURCES "benchmark/*.c") | |||
| if (NOT USE_OPENMP) | |||
| file(GLOB REMFILE "benchmark/smallscaling.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| endif() | |||
| if (BUILD_WITHOUT_LAPACK) | |||
| file(GLOB REMFILE "benchmark/cholesky.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/geev.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/gesv.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/getri.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/potrf.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/spmv.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/symv.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| file(GLOB REMFILE "benchmark/linpack.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| endif() | |||
| if (NOT USE_GEMM3M) | |||
| file(GLOB REMFILE "benchmark/gemm3m.c") | |||
| list(REMOVE_ITEM SOURCES ${REMFILE}) | |||
| endif() | |||
| foreach(source ${SOURCES}) | |||
| get_filename_component(name ${source} NAME_WE) | |||
| if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) | |||
| set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE") | |||
| foreach(define ${defines}) | |||
| set(target_name "benchmark_${name}") | |||
| if (NOT "${define}" STREQUAL "DEFAULT") | |||
| string(JOIN "_" define_str ${define}) | |||
| set(target_name "${target_name}_${define_str}") | |||
| endif() | |||
| if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND | |||
| (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND | |||
| (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND | |||
| (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) | |||
| add_executable(${target_name} ${source}) | |||
| target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) | |||
| target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} ) | |||
| # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) | |||
| if (NOT "${define}" STREQUAL "DEFAULT") | |||
| target_compile_definitions(${target_name} PRIVATE ${define}) | |||
| endif() | |||
| endif() | |||
| endforeach() | |||
| endif() | |||
| endforeach() | |||
| endif() | |||
| # Install project | |||
| @@ -218,4 +218,8 @@ In chronological order: | |||
| * [2022-08] Fix building from sources for QNX | |||
| * Mark Seminatore <https://github.com/mseminatore> | |||
| * [2023-11-09] Improve Windows threading performance scaling | |||
| * [2023-11-09] Improve Windows threading performance scaling | |||
| * [2024-02-09] Introduce MT_TRACE facility and improve code consistency | |||
| * Dirreke <https://github.com/dirreke> | |||
| * [2024-01-16] Add basic support for the CSKY architecture | |||
| @@ -1,4 +1,104 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.27 | |||
| 4-Apr-2024 | |||
| general: | |||
| - added initial (generic) support for the CSKY architecture | |||
| - capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating | |||
| underutilized or idle threads | |||
| - sped up multithreaded POTRF on all platforms | |||
| - added extension openblas_set_num_threads_local() that returns the previous thread count | |||
| - re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading | |||
| for too small workloads | |||
| - improved the fallback code used when the precompiled number of threads is exceeded, | |||
| and made it callable multiple times during the lifetime of an instance | |||
| - added CBLAS interfaces for the BLAS extensions ?AMIN, ?AMAX, CAXPYC and ZAXPYC | |||
| - fixed a potential buffer overflow in the interface to the GEMMT kernels | |||
| - fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14 | |||
| - fixed unwanted case sensitivity of the character parameters in ?TRTRS | |||
| - sped up the OpenMP thread management code | |||
| - fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK | |||
| - fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library | |||
| - added a testsuite for the BLAS extensions | |||
| - modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress | |||
| spurious errors | |||
| - added support for building the benchmark collection with CMAKE | |||
| - added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds | |||
| with OpenMP enabled that use clang with gfortran | |||
| - fixed building on systems with uClibc | |||
| - added support for calling ?NRM2 with a negative increment value on all architectures | |||
| - added support for the LLVM18 version of the flang-new compiler | |||
| - fixed handling of the OPENBLAS_LOOPS variable in several benchmarks | |||
| - Integrated fixes from the Reference-LAPACK project: | |||
| - Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981) | |||
| x86: | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - fixed GEMM3M functions failing in CMAKE builds | |||
| x86-64: | |||
| - removed all instances of sched_yield() on Linux and BSD | |||
| - fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26) | |||
| - fixed GEMM3M functions failing in CMAKE builds | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - added compiler checks for AVX512BF16 compatibility | |||
| - fixed LLVM compiler options for Sapphire Rapids | |||
| - fixed cpu handling fallbacks for Sapphire Rapids with | |||
| disabled AVX2 in DYNAMIC_ARCH mode | |||
| - fixed extensions SCSUM and DZSUM | |||
| - improved GEMM performance for ZEN targets | |||
| arm: | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| arm64: | |||
| - added initial support for the Cortex-A76 cpu | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - fixed default compiler options for gcc (-march and -mtune) | |||
| - added support for ArmCompilerForLinux | |||
| - added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds | |||
| - fixed mishandling of the INTERFACE64 option in CMAKE builds | |||
| - corrected SCSUM kernels (erroneously duplicating SCASUM behaviour) | |||
| - added SVE-enabled kernels for CSUM/ZSUM | |||
| - worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M | |||
| power: | |||
| - improved performance of SGEMM on POWER8/9/10 | |||
| - improved performance of DGEMM on POWER10 | |||
| - added support for OpenMP builds with xlc/xlf on AIX | |||
| - improved cpu autodetection for DYNAMIC_ARCH builds on older AIX | |||
| - fixed cpu core counting on AIX | |||
| - added support for building a shared library on AIX | |||
| riscv64: | |||
| - added support for the X280 cpu | |||
| - added support for semi-generic RISCV models with vector length 128 or 256 | |||
| - added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - improved cpu model autodetection | |||
| - fixed corner cases in ?AXPBY for C910V | |||
| - fixed handling of zero increments in ?AXPY kernels for C910V | |||
| loongarch64: | |||
| - added optimized kernels for ?AMIN and ?AMAX | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - fixed handling of corner cases in ?AXPBY | |||
| - fixed computation of SAMIN and DAMIN in LSX mode | |||
| - fixed computation of ?ROT | |||
| - added optimized SSYMV and DSYMV kernels for LSX and LASX mode | |||
| - added optimized CGEMM and ZGEMM kernels for LSX and LASX mode | |||
| - added optimized CGEMV and ZGEMV kernels | |||
| mips: | |||
| - fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22) | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - fixed mishandling of the INTERFACE64 option in CMAKE builds | |||
| zarch: | |||
| - fixed handling of NaN and Inf arguments in ZSCAL | |||
| - fixed calculation of ?SUM on Z13 | |||
| ==================================================================== | |||
| Version 0.3.26 | |||
| 2-Jan-2024 | |||
| @@ -1,5 +1,9 @@ | |||
| TOPDIR = . | |||
| include ./Makefile.system | |||
| LNCMD = ln -fs | |||
| ifeq ($(FIXED_LIBNAME), 1) | |||
| LNCMD = true | |||
| endif | |||
| BLASDIRS = interface driver/level2 driver/level3 driver/others | |||
| @@ -134,17 +138,17 @@ shared : libs netlib $(RELA) | |||
| ifneq ($(NO_SHARED), 1) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so | |||
| @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @$(MAKE) -C exports dyn | |||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @$(MAKE) -C exports dll | |||
| @@ -152,6 +156,9 @@ endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| @$(MAKE) -C exports dll | |||
| endif | |||
| ifeq ($(OSNAME), AIX) | |||
| @$(MAKE) -C exports so | |||
| endif | |||
| endif | |||
| tests : shared | |||
| @@ -229,13 +236,13 @@ ifeq ($(INTERFACE64),1) | |||
| endif | |||
| @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last | |||
| @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last | |||
| @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| @touch lib.grd | |||
| prof : prof_blas prof_lapack | |||
| prof_blas : | |||
| ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) | |||
| for d in $(SUBDIRS) ; \ | |||
| do if test -d $$d; then \ | |||
| $(MAKE) -C $$d prof || exit 1 ; \ | |||
| @@ -246,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| endif | |||
| blas : | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| for d in $(BLASDIRS) ; \ | |||
| do if test -d $$d; then \ | |||
| $(MAKE) -C $$d libs || exit 1 ; \ | |||
| @@ -254,7 +261,7 @@ blas : | |||
| done | |||
| hpl : | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| for d in $(BLASDIRS) ../laswp exports ; \ | |||
| do if test -d $$d; then \ | |||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | |||
| @@ -268,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| endif | |||
| hpl_p : | |||
| ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) | |||
| for d in $(SUBDIRS) ../laswp exports ; \ | |||
| do if test -d $$d; then \ | |||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | |||
| @@ -309,8 +316,12 @@ endif | |||
| -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) | |||
| -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1) | |||
| -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| endif | |||
| -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -401,6 +412,7 @@ lapack-runtest: lapack-test | |||
| blas-test: | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||
| (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) | |||
| @@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA76) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), FT2000) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -104,19 +111,25 @@ ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(CROSS), 1) | |||
| CCOMMON_OPT += -mtune=native | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| FCOMMON_OPT += -march=armv8.4-a | |||
| ifneq ($(CROSS), 1) | |||
| FCOMMON_OPT += -mtune=native | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| @@ -132,25 +145,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifneq ($(OSNAME), Darwin) | |||
| CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native | |||
| CCOMMON_OPT += -march=armv8.5-a+sve | |||
| ifneq ($(CROSS), 1) | |||
| CCOMMON_OPT += -mtune=native | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| FCOMMON_OPT += -march=armv8.5-a | |||
| ifneq ($(CROSS), 1) | |||
| FCOMMON_OPT += -mtune=native | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| @@ -258,9 +277,17 @@ endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), CORTEXX1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 | |||
| CCOMMON_OPT += -march=armv8.2-a | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG))) | |||
| CCOMMON_OPT += -mtune=cortex-x1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -271,6 +298,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a+sve | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG))) | |||
| CCOMMON_OPT += -mtune=cortex-x2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mtune=cortex-x2 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -290,6 +323,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a+sve | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG))) | |||
| CCOMMON_OPT += -mtune=cortex-a710 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mtune=cortex-a710 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -0,0 +1,4 @@ | |||
| ifeq ($(CORE), CK860FV) | |||
| CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float | |||
| FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static | |||
| endif | |||
| @@ -2,11 +2,15 @@ TOPDIR = . | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| -include $(TOPDIR)/Makefile.conf_last | |||
| include ./Makefile.system | |||
| LNCMD = ln -fs | |||
| ifdef THELIBNAME | |||
| LIBNAME=$(THELIBNAME) | |||
| LIBSONAME=$(THELIBSONAME) | |||
| endif | |||
| ifeq ($(FIXED_LIBNAME), 1) | |||
| LNCMD = true | |||
| endif | |||
| ifeq ($(INTERFACE64),1) | |||
| USE_64BITINT=1 | |||
| endif | |||
| @@ -99,7 +103,7 @@ ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifneq ($(NO_SHARED),1) | |||
| @@ -107,21 +111,21 @@ ifneq ($(NO_SHARED),1) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) | |||
| @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ | |||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ | |||
| $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @@ -149,15 +153,15 @@ ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| endif | |||
| @@ -170,6 +174,8 @@ endif | |||
| @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" | |||
| @echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)" | |||
| @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" | |||
| @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | |||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||
| @@ -186,7 +192,7 @@ endif | |||
| ifneq ($(NO_SHARED),1) | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V) | |||
| TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), CK860FV) | |||
| TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float | |||
| endif | |||
| ifeq ($(TARGET), x280) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL256B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL128B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_GENERIC) | |||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| @@ -2,3 +2,19 @@ ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
| ifeq ($(CORE), x280) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL256B) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL128B) | |||
| CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_GENERIC) | |||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | |||
| endif | |||
| @@ -3,7 +3,12 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.26 | |||
| VERSION = 0.3.26.dev | |||
| # If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a | |||
| # and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library | |||
| # | |||
| # LIBNAMEPREFIX = scipy | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -365,8 +365,9 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) | |||
| # Note that the behavior of -dumpversion is compile-time-configurable for | |||
| # gcc-7.x and newer. Use -dumpfullversion there | |||
| ifeq ($(GCCVERSIONGTEQ7),1) | |||
| @@ -873,6 +874,11 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), csky) | |||
| NO_BINARY_MODE = 1 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| # | |||
| # C Compiler dependent settings | |||
| # | |||
| @@ -1176,7 +1182,7 @@ ifeq ($(F_COMPILER), IBM) | |||
| CCOMMON_OPT += -DF_INTERFACE_IBM | |||
| FEXTRALIB += -lxlf90 | |||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) | |||
| FCOMMON_OPT += -qextname | |||
| FCOMMON_OPT += -qextname -qzerosize | |||
| endif | |||
| # FCOMMON_OPT += -qarch=440 | |||
| ifdef BINARY64 | |||
| @@ -1511,16 +1517,28 @@ ifndef LIBSONAMEBASE | |||
| LIBSONAMEBASE = openblas | |||
| endif | |||
| ifndef LIBNAMEPREFIX | |||
| LIBNAMEPREFIX = | |||
| endif | |||
| SYMPREFIX=$(SYMBOLPREFIX) | |||
| ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX)) | |||
| SYMPREFIX= | |||
| endif | |||
| SYMSUFFIX=$(SYMBOLSUFFIX) | |||
| ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX)) | |||
| SYMSUFFIX= | |||
| endif | |||
| ifndef LIBNAMESUFFIX | |||
| LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) | |||
| LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX) | |||
| else | |||
| LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) | |||
| LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX) | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| LIBPREFIX = cyg$(LIBNAMEBASE) | |||
| LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE) | |||
| else | |||
| LIBPREFIX = lib$(LIBNAMEBASE) | |||
| LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE) | |||
| endif | |||
| KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||
| @@ -1652,6 +1670,10 @@ ifeq ($(F_COMPILER),CRAY) | |||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| ifeq ($(F_COMPILER),FLANGNEW) | |||
| LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||
| @@ -1699,14 +1721,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) | |||
| endif | |||
| endif | |||
| ifeq ($(FIXED_LIBNAME),1) | |||
| LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX) | |||
| LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX) | |||
| endif | |||
| LIBDLLNAME = $(LIBPREFIX).dll | |||
| IMPLIBNAME = lib$(LIBNAMEBASE).dll.a | |||
| ifneq ($(OSNAME), AIX) | |||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | |||
| else | |||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) | |||
| endif | |||
| LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | |||
| LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | |||
| LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | |||
| @@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC) | |||
| endif | |||
| endif | |||
| else ifeq ($(C_COMPILER), CLANG) | |||
| # cooperlake support was added in clang 12 | |||
| # sapphire rapids support was added in clang 12 | |||
| ifeq ($(CLANGVERSIONGTEQ12), 1) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # not supported in clang, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| @@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **Cortex A57**: Optimized Level-3 and Level-2 functions | |||
| - **Cortex A72**: same as A57 (different cpu specifications) | |||
| - **Cortex A73**: same as A57 (different cpu specifications) | |||
| - **Cortex A76**: same as A57 (different cpu specifications) | |||
| - **Falkor**: same as A57 (different cpu specifications) | |||
| - **ThunderX**: Optimized some Level-1 functions | |||
| - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 | |||
| @@ -185,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. | |||
| - **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2. | |||
| - **AIX**: Dynamic architecture with OpenXL and OpenMP. | |||
| ```sh | |||
| make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1 | |||
| ``` | |||
| #### IBM zEnterprise System | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 | |||
| @@ -198,6 +204,21 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| ``` | |||
| (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) | |||
| - **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| - **ZVL???B**: Level-3 BLAS and Level-1,2, including vectorised kernels, for generic RISC-V cores whose vector registers are at least the corresponding width; ZVL128B and ZVL256B are available. | |||
| e.g.: | |||
| ```sh | |||
| make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \ | |||
| BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \ | |||
| AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \ | |||
| LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \ | |||
| HOSTCC=gcc HOSTFC=gfortran -j | |||
| ``` | |||
| ### Support for multiple targets in a single library | |||
| OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. | |||
| @@ -227,7 +248,7 @@ Please note that it is not possible to combine support for different architectur | |||
| - **NetBSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| - **AIX**: Supported on PPC up to POWER8 | |||
| - **AIX**: Supported on PPC up to POWER10 | |||
| - **Haiku**: Supported by the community. We don't actively test the library on this OS. | |||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. | |||
| @@ -93,6 +93,7 @@ CORTEXA53 | |||
| CORTEXA57 | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| CORTEXA76 | |||
| CORTEXA510 | |||
| CORTEXA710 | |||
| CORTEXX1 | |||
| @@ -118,8 +119,11 @@ Z13 | |||
| Z14 | |||
| 10.RISC-V 64: | |||
| RISCV64_GENERIC | |||
| RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54) | |||
| RISCV64_ZVL128B | |||
| C910V | |||
| x280 | |||
| RISCV64_ZVL256B | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| @@ -133,3 +137,7 @@ E2K | |||
| EV4 | |||
| EV5 | |||
| EV6 | |||
| 14.CSKY | |||
| CSKY | |||
| CK860FV | |||
| @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib | |||
| #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| # x280 temporary workaround for gfortran | |||
| ifeq ($(TARGET), x280) | |||
| CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) | |||
| endif | |||
| ifneq ($(NO_LAPACK), 1) | |||
| GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ | |||
| @@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling | |||
| include $(TOPDIR)/Makefile.tail | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -92,7 +92,7 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); | |||
| @@ -85,7 +85,7 @@ int main(int argc, char *argv[]){ | |||
| double time1, time2, timeg1,timeg2; | |||
| char *p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| argc--;argv++; | |||
| @@ -120,7 +120,7 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); | |||
| @@ -54,7 +54,7 @@ int main(int argc, char *argv[]){ | |||
| int step = 1; | |||
| int loops = 1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| double time1,timeg; | |||
| @@ -91,6 +91,7 @@ case "$data" in | |||
| *ARCH_ZARCH*) architecture=zarch ;; | |||
| *ARCH_RISCV64*) architecture=riscv64 ;; | |||
| *ARCH_LOONGARCH64*) architecture=loongarch64 ;; | |||
| *ARCH_CSKY*) architecture=csky ;; | |||
| esac | |||
| defined=0 | |||
| @@ -236,6 +237,7 @@ case "$data" in | |||
| *ARCH_ARM*) architecture=arm ;; | |||
| *ARCH_ZARCH*) architecture=zarch ;; | |||
| *ARCH_LOONGARCH64*) architecture=loongarch64 ;; | |||
| *ARCH_CSKY*) architecture=csky ;; | |||
| esac | |||
| binformat='bin32' | |||
| @@ -244,6 +246,7 @@ case "$data" in | |||
| esac | |||
| no_avx512=0 | |||
| no_avx512bf=0 | |||
| if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| @@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then | |||
| } | |||
| rm -rf "$tmpd" | |||
| if [ "$no_avx512" -eq 0 ]; then | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.c" | |||
| code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' | |||
| printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf" | |||
| if [ "$compiler" = "PGI" ]; then | |||
| args=" -tp cooperlake -c -o $tmpf.o $tmpf" | |||
| else | |||
| args=" -march=cooperlake -c -o $tmpf.o $tmpf" | |||
| fi | |||
| no_avx512bf=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_avx512bf=1 | |||
| } | |||
| rm -rf "$tmpd" | |||
| fi | |||
| fi | |||
| no_rv64gv=0 | |||
| @@ -409,6 +431,7 @@ done | |||
| [ "$makefile" = "-" ] && { | |||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| exit 0 | |||
| @@ -437,6 +460,7 @@ done | |||
| [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" | |||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" | |||
| @@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $architecture = csky if ($data =~ /ARCH_CSKY/); | |||
| $defined = 0; | |||
| @@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") { | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "csky") { | |||
| $defined = 1; | |||
| $binary = 32; | |||
| } | |||
| if ($compiler eq "PGI") { | |||
| $compiler_name .= " -tp p7" if ($binary eq "32"); | |||
| $compiler_name .= " -tp p7-64" if ($binary eq "64"); | |||
| @@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $architecture = csky if ($data =~ /ARCH_CSKY/); | |||
| $binformat = bin32; | |||
| $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| @@ -12,6 +12,7 @@ extern "C" { | |||
| /*Set the number of threads on runtime.*/ | |||
| void openblas_set_num_threads(int num_threads); | |||
| void goto_set_num_threads(int num_threads); | |||
| int openblas_set_num_threads_local(int num_threads); | |||
| /*Get the number of threads on runtime.*/ | |||
| int openblas_get_num_threads(void); | |||
| @@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| @@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA | |||
| void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
| @@ -64,6 +64,7 @@ else () | |||
| "#define NEEDBUNDERSCORE 1\n") | |||
| endif() | |||
| if (CMAKE_Fortran_COMPILER) | |||
| get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) | |||
| string(TOUPPER ${F_COMPILER} F_COMPILER) | |||
| endif() | |||
| @@ -6,9 +6,6 @@ | |||
| if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| endif () | |||
| @@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| if (MIPS64) | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") | |||
| if (INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") | |||
| endif () | |||
| @@ -83,9 +83,14 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (ARM64 AND INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| else () | |||
| if (BINARY64) | |||
| if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| endif () | |||
| if (INTERFACE64) | |||
| if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") | |||
| if (WIN32) | |||
| @@ -98,7 +103,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| endif () | |||
| endif () | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m32") | |||
| if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m32") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| @@ -1,4 +1,6 @@ | |||
| libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||
| libnameprefix=@LIBNAMEPREFIX@ | |||
| libnamesuffix=@LIBNAMESUFFIX@ | |||
| libsuffix=@SUFFIX64_UNDERSCORE@ | |||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||
| @@ -7,5 +9,5 @@ Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OpenBLAS_VERSION@ | |||
| URL: https://github.com/OpenMathLib/OpenBLAS | |||
| Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} | |||
| Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} | |||
| Cflags: -I${includedir} | |||
| @@ -932,7 +932,7 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t49152\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH | |||
| endif() | |||
| endif() | |||
| endif() | |||
| set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas") | |||
| if (DEFINED LIBNAMESUFFIX) | |||
| set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") | |||
| else () | |||
| set(LIBPREFIX "libopenblas") | |||
| set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}") | |||
| endif () | |||
| if (NOT DEFINED SYMBOLPREFIX) | |||
| @@ -615,13 +616,19 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| endforeach () | |||
| endif () | |||
| if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY") | |||
| if (CMAKE_Fortran_COMPILER) | |||
| if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| # Strip these flags from the LAPACK Fortran flag strings for the compilers selected above. | |||
| # Entries must be separated by semicolons to form a CMake list. -mavx2 is listed before | |||
| # -mavx so that the shorter flag cannot clip it and leave a stray "2" behind. | |||
| set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx2;-mavx;-mskylake-avx512") | |||
| if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| message(STATUS "removing fortran flags") | |||
| set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | |||
| endif () | |||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
| # Quote the input so an empty flag string is still passed as one (empty) argument. | |||
| string(REPLACE "${FILTER_FLAG}" "" LAPACK_FFLAGS "${LAPACK_FFLAGS}") | |||
| string(REPLACE "${FILTER_FLAG}" "" LAPACK_FPFLAGS "${LAPACK_FPFLAGS}") | |||
| endforeach () | |||
| endif () | |||
| endif () | |||
| if ("${F_COMPILER}" STREQUAL "GFORTRAN") | |||
| # lapack-netlib is rife with uninitialized warnings -hpa | |||
| @@ -679,6 +686,10 @@ else () | |||
| endif () | |||
| endif () | |||
| if (DEFINED FIXED_LIBNAME) | |||
| set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}") | |||
| set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}") | |||
| endif() | |||
| set(LIBDLLNAME "${LIBPREFIX}.dll") | |||
| set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") | |||
| @@ -358,12 +358,6 @@ typedef int blasint; | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); | |||
| #endif | |||
| #ifdef BULLDOZER | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #ifndef YIELDING | |||
| @@ -371,21 +365,13 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| */ | |||
| /* | |||
| #ifdef STEAMROLLER | |||
| #if defined(ARCH_X86_64) | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| */ | |||
| #ifdef __EMSCRIPTEN__ | |||
| #define YIELDING | |||
| @@ -396,7 +382,7 @@ typedef int blasint; | |||
| #endif | |||
| /*** | |||
| To alloc job_t on heap or statck. | |||
| To alloc job_t on heap or stack. | |||
| please see https://github.com/xianyi/OpenBLAS/issues/246 | |||
| ***/ | |||
| #if defined(OS_WINDOWS) | |||
| @@ -482,6 +468,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_e2k.h" | |||
| #endif | |||
| #ifdef ARCH_CSKY | |||
| #include "common_csky.h" | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWSSTORE | |||
| typedef char env_var_t[MAX_PATH]; | |||
| @@ -0,0 +1,56 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #ifndef COMMON_CSKY | |||
| #define COMMON_CSKY | |||
| #define MB __sync_synchronize() | |||
| #define WMB __sync_synchronize() | |||
| #define RMB __sync_synchronize() | |||
| #define INLINE inline | |||
| #ifndef ASSEMBLER | |||
| /* Integer quotient of x by y, computed with the plain C division operator. */ | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| blasint quotient = x / y; | |||
| return quotient; | |||
| } | |||
| #endif | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define SEEK_ADDRESS | |||
| #endif | |||
| @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * | |||
| void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | |||
| xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | |||
| void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, | |||
| float *, float *, blasint *, float *, blasint *, | |||
| float *, float *, blasint *); | |||
| @@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); | |||
| void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); | |||
| void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); | |||
| void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); | |||
| void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); | |||
| @@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define SEEK_ADDRESS | |||
| #if defined(C910V) | |||
| #include <riscv_vector.h> | |||
| #if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280) | |||
| # include <riscv_vector.h> | |||
| #endif | |||
| #if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) | |||
| // t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this | |||
| #define RISCV_0p10_INTRINSICS | |||
| #define RISCV_RVV(x) x | |||
| #else | |||
| #define RISCV_RVV(x) __riscv_ ## x | |||
| #endif | |||
| #if defined(C910V) || defined(RISCV64_ZVL256B) | |||
| # if !defined(DOUBLE) | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) | |||
| # else | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) | |||
| # endif | |||
| #else | |||
| # define EXTRACT_FLOAT(v) (v[0]) | |||
| #endif | |||
| #endif | |||
| @@ -137,19 +137,20 @@ typedef struct blas_queue { | |||
| extern int blas_server_avail; | |||
| extern int blas_omp_number_max; | |||
| extern int blas_omp_threads_local; | |||
| static __inline int num_cpu_avail(int level) { | |||
| #ifdef USE_OPENMP | |||
| int openmp_nthreads; | |||
| openmp_nthreads=omp_get_max_threads(); | |||
| if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local; | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| if (blas_cpu_number == 1 | |||
| #endif | |||
| #ifdef USE_OPENMP | |||
| if (openmp_nthreads == 1 || omp_in_parallel() | |||
| #else | |||
| if (openmp_nthreads == 1 | |||
| #endif | |||
| ) return 1; | |||
| @@ -42,6 +42,7 @@ size_t length64=sizeof(value64); | |||
| #define CPU_CORTEXA57 3 | |||
| #define CPU_CORTEXA72 4 | |||
| #define CPU_CORTEXA73 5 | |||
| #define CPU_CORTEXA76 23 | |||
| #define CPU_NEOVERSEN1 11 | |||
| #define CPU_NEOVERSEV1 16 | |||
| #define CPU_NEOVERSEN2 17 | |||
| @@ -89,7 +90,8 @@ static char *cpuname[] = { | |||
| "CORTEXX2", | |||
| "CORTEXA510", | |||
| "CORTEXA710", | |||
| "FT2000" | |||
| "FT2000", | |||
| "CORTEXA76" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -115,7 +117,8 @@ static char *cpuname_lower[] = { | |||
| "cortexx2", | |||
| "cortexa510", | |||
| "cortexa710", | |||
| "ft2000" | |||
| "ft2000", | |||
| "cortexa76" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -210,6 +213,8 @@ int detect(void) | |||
| return CPU_CORTEXX2; | |||
| else if (strstr(cpu_part, "0xd4e")) //X3 | |||
| return CPU_CORTEXX2; | |||
| else if (strstr(cpu_part, "0xd0b")) | |||
| return CPU_CORTEXA76; | |||
| } | |||
| // Qualcomm | |||
| else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
| @@ -391,6 +396,7 @@ void get_cpuconfig(void) | |||
| break; | |||
| case CPU_NEOVERSEV1: | |||
| case CPU_CORTEXA76: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| @@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_x280 2 | |||
| #define CPU_RISCV64_ZVL256B 3 | |||
| #define CPU_RISCV64_ZVL128B 4 | |||
| static char *cpuname[] = { | |||
| "RISCV64_GENERIC", | |||
| "C910V" | |||
| "C910V", | |||
| "x280", | |||
| "CPU_RISCV64_ZVL256B", | |||
| "CPU_RISCV64_ZVL128B" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "riscv64_generic", | |||
| "c910v", | |||
| "x280", | |||
| "riscv64_zvl256b", | |||
| "riscv64_zvl128b" | |||
| }; | |||
| int detect(void){ | |||
| @@ -86,23 +100,29 @@ int detect(void){ | |||
| char *pmodel = NULL, *pisa = NULL; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| if (!infile) | |||
| return CPU_GENERIC; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if(!strncmp(buffer, "model name", 10)){ | |||
| strcpy(model_buffer, buffer); | |||
| pmodel = strchr(isa_buffer, ':') + 1; | |||
| pmodel = strchr(model_buffer, ':'); | |||
| if (pmodel) | |||
| pmodel++; | |||
| } | |||
| if(!strncmp(buffer, "isa", 3)){ | |||
| strcpy(isa_buffer, buffer); | |||
| pisa = strchr(isa_buffer, '4') + 1; | |||
| pisa = strchr(isa_buffer, '4'); | |||
| if (pisa) | |||
| pisa++; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (!pmodel) | |||
| if (!pmodel || !pisa) | |||
| return(CPU_GENERIC); | |||
| if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) | |||
| return CPU_C910V; | |||
| @@ -140,5 +160,5 @@ void get_cpuconfig(void){ | |||
| } | |||
| void get_libname(void){ | |||
| printf("riscv64\n"); | |||
| printf("%s", cpuname_lower[detect()]); | |||
| } | |||
| @@ -173,6 +173,10 @@ HAVE_C11 | |||
| ARCH_E2K | |||
| #endif | |||
| #if defined(__csky__) | |||
| ARCH_CSKY | |||
| #endif | |||
| #if defined(__EMSCRIPTEN__) | |||
| ARCH_RISCV64 | |||
| OS_WINDOWS | |||
| @@ -40,6 +40,10 @@ else() | |||
| c_${float_char}blas1.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat1 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat1 m) | |||
| endif() | |||
| @@ -65,6 +69,10 @@ else() | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat2 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat2 m) | |||
| endif() | |||
| @@ -80,6 +88,17 @@ if (NOT NOFORTRAN) | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| add_executable(x${float_char}cblat3_3m | |||
| c_${float_char}blat3_3m.f | |||
| c_${float_char}blas3_3m.c | |||
| c_${float_char}3chke_3m.c | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| endif() | |||
| endif() | |||
| else() | |||
| add_executable(x${float_char}cblat3 | |||
| c_${float_char}blat3c.c | |||
| @@ -88,12 +107,44 @@ else() | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| add_executable(x${float_char}cblat3_3m | |||
| c_${float_char}blat3c_3m.c | |||
| c_${float_char}blas3_3m.c | |||
| c_${float_char}3chke_3m.c | |||
| auxiliary.c | |||
| c_xerbla.c | |||
| constant.c) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3 m) | |||
| endif() | |||
| # Link settings for the complex-only GEMM3M level-3 test driver (x{c,z}cblat3_3m). | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| # GNU Fortran + Clang C: strip -fopenmp from the Fortran flags and link omp/pthread explicitly. | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| # The libraries must be attached to the _3m target itself, not to x${float_char}cblat3. | |||
| target_link_libraries(x${float_char}cblat3_3m omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3_3m m) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat3" | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| add_test(NAME "x${float_char}cblat3_3m" | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m") | |||
| endif() | |||
| endif() | |||
| endforeach() | |||
| @@ -5,6 +5,24 @@ | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| # SUPPORT_GEMM3M selects whether the GEMM3M test drivers (xccblat3_3m / xzcblat3_3m) | |||
| # are built and run. It defaults to 0 and is switched to 1 only when ARCH is one of | |||
| # x86, x86_64, ia64 or MIPS; rules further down test it with ifeq ($(SUPPORT_GEMM3M),1). | |||
| SUPPORT_GEMM3M = 0 | |||
| ifeq ($(ARCH), x86) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), ia64) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), MIPS) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| override CFLAGS += -DADD$(BU) -DCBLAS | |||
| ifeq ($(F_COMPILER),GFORTRAN) | |||
| override FFLAGS += -fno-tree-vectorize | |||
| @@ -144,9 +162,15 @@ all3targets += xdcblat3 | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX),1) | |||
| all3targets += xccblat3 | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| all3targets += xccblat3_3m | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX16),1) | |||
| all3targets += xzcblat3 | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| all3targets += xzcblat3_3m | |||
| endif | |||
| endif | |||
| all3: $(all3targets) | |||
| @@ -181,9 +205,9 @@ endif | |||
| endif | |||
| endif | |||
| all3_3m: xzcblat3_3m xccblat3_3m | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| ifeq ($(BUILD_COMPLEX),1) | |||
| OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX16),1) | |||
| @@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1) | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -218,6 +243,9 @@ ifeq ($(F_COMPILER), IBM) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CEXTRALIB += -lgomp | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -268,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| endif | |||
| else | |||
| xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| @@ -277,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| # C-linked CGEMM3M test driver. Like the other $(CC)-linked xccblat* rules in this | |||
| # branch, drop -lgfortran from EXTRALIB so the link does not require the Fortran runtime. | |||
| xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -290,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) | |||
| endif | |||
| else | |||
| xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| @@ -299,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| # C-linked ZGEMM3M test driver. Like the other $(CC)-linked xzcblat* rules in this | |||
| # branch, drop -lgfortran from EXTRALIB so the link does not require the Fortran runtime. | |||
| xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -96,7 +96,7 @@ | |||
| INTEGER ICAMAXTEST | |||
| EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST | |||
| * .. External Subroutines .. | |||
| EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX | |||
| * .. Common blocks .. | |||
| @@ -214,8 +214,8 @@ | |||
| CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), | |||
| + STRUE4(NP1),SFAC) | |||
| ELSE IF (ICASE.EQ.8) THEN | |||
| * .. CSCAL .. | |||
| CALL CSCAL(N,CA,CX,INCX) | |||
| * .. CSCALTEST .. | |||
| CALL CSCALTEST(N,CA,CX,INCX) | |||
| CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), | |||
| + SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| @@ -236,14 +236,14 @@ | |||
| * | |||
| INCX = 1 | |||
| IF (ICASE.EQ.8) THEN | |||
| * CSCAL | |||
| * CSCALTEST | |||
| * Add a test for alpha equal to zero. | |||
| CA = (0.0E0,0.0E0) | |||
| DO 80 I = 1, 5 | |||
| MWPCT(I) = (0.0E0,0.0E0) | |||
| MWPCS(I) = (1.0E0,1.0E0) | |||
| 80 CONTINUE | |||
| CALL CSCAL(5,CA,CX,INCX) | |||
| CALL CSCALTEST(5,CA,CX,INCX) | |||
| CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| * CSSCALTEST | |||
| @@ -440,6 +440,7 @@ static real c_b43 = (float)1.; | |||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
| static complex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
| static complex cx[8]; | |||
| extern real scnrm2test_(integer*, complex*, integer*); | |||
| static integer np1; | |||
| @@ -481,7 +482,7 @@ static real c_b43 = (float)1.; | |||
| stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); | |||
| } else if (combla_1.icase == 8) { | |||
| /* .. CSCAL .. */ | |||
| cscal_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], | |||
| &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| @@ -515,7 +516,7 @@ static real c_b43 = (float)1.; | |||
| mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; | |||
| /* L80: */ | |||
| } | |||
| cscal_(&c__5, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&c__5, &ca, cx, &combla_1.incx); | |||
| ctest_(&c__5, cx, mwpct, mwpcs, sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| /* CSSCALTEST */ | |||
| @@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| *range_n, IFLOAT *sa, IFLOAT *sb, | |||
| BLASLONG nthreads_m, BLASLONG nthreads_n) { | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #ifdef USE_OPENMP | |||
| static omp_lock_t level3_lock, critical_section_lock; | |||
| static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, | |||
| parallel_section_left = MAX_PARALLEL_NUMBER; | |||
| // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | |||
| while(omp_lock_initialized == 0) | |||
| { | |||
| blas_lock(&init_lock); | |||
| { | |||
| if(omp_lock_initialized == 0) | |||
| { | |||
| omp_init_lock(&level3_lock); | |||
| omp_init_lock(&critical_section_lock); | |||
| omp_lock_initialized = 1; | |||
| WMB; | |||
| } | |||
| blas_unlock(&init_lock); | |||
| } | |||
| } | |||
| #elif defined(OS_WINDOWS) | |||
| CRITICAL_SECTION level3_lock; | |||
| InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #else | |||
| CRITICAL_SECTION level3_lock; | |||
| InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #endif | |||
| blas_arg_t newarg; | |||
| @@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_lock(&level3_lock); | |||
| #ifdef USE_OPENMP | |||
| omp_set_lock(&level3_lock); | |||
| omp_set_lock(&critical_section_lock); | |||
| parallel_section_left--; | |||
| /* | |||
| How OpenMP locks work with NUM_PARALLEL | |||
| 1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions | |||
| 2) level3_lock acts like a master lock or barrier which stops OpenBLAS calls when all the parallel_sections are currently busy executing other OpenBLAS calls | |||
| 3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking the master lock whenever required | |||
| 4) Unlock the master lock only when we have not already exhausted all the parallel_sections, allowing another thread with an OpenBLAS call to enter | |||
| */ | |||
| if(parallel_section_left != 0) | |||
| omp_unset_lock(&level3_lock); | |||
| omp_unset_lock(&critical_section_lock); | |||
| #elif defined(OS_WINDOWS) | |||
| EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #else | |||
| EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| pthread_mutex_lock(&level3_lock); | |||
| #endif | |||
| #ifdef USE_ALLOC_HEAP | |||
| @@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| free(job); | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_unlock(&level3_lock); | |||
| #else | |||
| #ifdef USE_OPENMP | |||
| omp_set_lock(&critical_section_lock); | |||
| parallel_section_left++; | |||
| /* | |||
| Unlock the master lock only when all the parallel_sections were exhausted and one of the threads has completed its OpenBLAS call; | |||
| otherwise just increment parallel_section_left. | |||
| The master lock is only held when we have exhausted all the parallel_sections, so only unlock it then and otherwise just increment the count. | |||
| */ | |||
| if(parallel_section_left == 1) | |||
| omp_unset_lock(&level3_lock); | |||
| omp_unset_lock(&critical_section_lock); | |||
| #elif defined(OS_WINDOWS) | |||
| LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #else | |||
| pthread_mutex_unlock(&level3_lock); | |||
| #endif | |||
| return 0; | |||
| @@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void); | |||
| /* We need this global for checking if initialization is finished. */ | |||
| int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | |||
| int blas_omp_threads_local = 1; | |||
| /* Local Variables */ | |||
| #if defined(USE_PTHREAD_LOCK) | |||
| static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| @@ -69,6 +69,7 @@ | |||
| int blas_server_avail = 0; | |||
| int blas_omp_number_max = 0; | |||
| int blas_omp_threads_local = 1; | |||
| extern int openblas_omp_adaptive_env(void); | |||
| @@ -406,7 +407,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| } | |||
| #endif | |||
| while(true) { | |||
| while (true) { | |||
| for(i=0; i < MAX_PARALLEL_NUMBER; i++) { | |||
| #ifdef HAVE_C11 | |||
| _Bool inuse = false; | |||
| @@ -419,10 +420,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| break; | |||
| } | |||
| } | |||
| if(i != MAX_PARALLEL_NUMBER) | |||
| break; | |||
| } | |||
| if (i != MAX_PARALLEL_NUMBER) | |||
| break; | |||
| } | |||
| if (openblas_omp_adaptive_env() != 0) { | |||
| #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| @@ -48,6 +48,12 @@ | |||
| #endif | |||
| #endif | |||
| /* Debug tracing helper: when SMP_DEBUG is defined, MT_TRACE forwards its */ | |||
| /* printf-style arguments to fprintf(stderr, ...); otherwise it expands to */ | |||
| /* nothing, so trace calls cost nothing in regular builds. */ | |||
| #ifdef SMP_DEBUG | |||
| # define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) | |||
| #else | |||
| # define MT_TRACE(...) | |||
| #endif | |||
| /* This is a thread implementation for Win32 lazy implementation */ | |||
| /* Thread server common information */ | |||
| @@ -59,6 +65,8 @@ static CRITICAL_SECTION queue_lock; | |||
| /* We need this global for checking if initialization is finished. */ | |||
| int blas_server_avail = 0; | |||
| int blas_omp_threads_local = 1; | |||
| /* Local Variables */ | |||
| static BLASULONG server_lock = 0; | |||
| @@ -66,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | |||
| static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | |||
| #if defined (__GNUC__) && (__GNUC__ < 6) | |||
| #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) | |||
| #else | |||
| #if defined(_WIN64) | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) | |||
| #else | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) | |||
| #endif | |||
| #endif | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| // | |||
| // Legacy code path | |||
| // | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { | |||
| if (!(mode & BLAS_COMPLEX)){ | |||
| if (!(mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| /* REAL / Extended Double */ | |||
| @@ -93,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* REAL / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -104,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { | |||
| /* REAL / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| @@ -116,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -127,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { | |||
| /* REAL / BLAS_STOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -138,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { | |||
| /* REAL / BLAS_DTOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -155,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } else { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| /* COMPLEX / Extended Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| @@ -169,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* COMPLEX / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -199,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| /* This is a main routine of threads. Each thread waits until job is */ | |||
| /* queued. */ | |||
| static DWORD WINAPI blas_thread_server(void *arg){ | |||
| // | |||
| // This is a main routine of threads. Each thread waits until job is queued. | |||
| // | |||
| static DWORD WINAPI blas_thread_server(void *arg) { | |||
| /* Thread identifier */ | |||
| BLASLONG cpu = (BLASLONG)arg; | |||
| @@ -213,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| /* Each server needs each buffer */ | |||
| buffer = blas_memory_alloc(2); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | |||
| while (1){ | |||
| while (1) { | |||
| /* Waiting for Queue */ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); | |||
| #endif | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); | |||
| if (cpu > thread_target - 2) | |||
| { | |||
| //printf("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); | |||
| #endif | |||
| if (cpu > thread_target - 2) { | |||
| //MT_TRACE("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| MT_TRACE("Server[%2ld] Got it.\n", cpu); | |||
| #if 1 | |||
| EnterCriticalSection(&queue_lock); | |||
| queue = work_queue; | |||
| @@ -245,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| work_queue = work_queue->next; | |||
| LeaveCriticalSection(&queue_lock); | |||
| #else | |||
| volatile blas_queue_t* queue_next; | |||
| INT_PTR prev_value; | |||
| do { | |||
| queue = (volatile blas_queue_t*)work_queue; | |||
| if (!queue) | |||
| break; | |||
| queue_next = (volatile blas_queue_t*)queue->next; | |||
| prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); | |||
| } while (prev_value != queue); | |||
| #endif | |||
| if (queue) { | |||
| if (queue) { | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
| sa = queue -> sa; | |||
| sb = queue -> sb; | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
| MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||
| #endif | |||
| // fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sa == NULL) | |||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sb == NULL) { | |||
| if (!(queue -> mode & BLAS_COMPLEX)){ | |||
| if (!(queue -> mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } else | |||
| #endif | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| #ifdef BUILD_DOUBLE | |||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| @@ -325,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| /* Other types in future */ | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| } else { | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } | |||
| }else{ | |||
| continue; //if queue == NULL | |||
| } | |||
| } else { | |||
| continue; //if queue == NULL | |||
| } | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Finished!\n", cpu); | |||
| queue->finished = 1; | |||
| queue->finished = 1; | |||
| } | |||
| /* Shutdown procedure */ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Shutdown!\n", cpu); | |||
| blas_memory_free(buffer); | |||
| return 0; | |||
| } | |||
| } | |||
| /* Initializing routine */ | |||
| int blas_thread_init(void){ | |||
| // | |||
| // Initializing routine | |||
| // | |||
| int blas_thread_init(void) { | |||
| BLASLONG i; | |||
| if (blas_server_avail || (blas_cpu_number <= 1)) return 0; | |||
| LOCK_COMMAND(&server_lock); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", | |||
| blas_cpu_number); | |||
| #endif | |||
| MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | |||
| if (!blas_server_avail){ | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| thread_target = blas_cpu_number; | |||
| thread_target = blas_cpu_number; | |||
| InitializeCriticalSection(&queue_lock); | |||
| for(i = 0; i < blas_cpu_number - 1; i++){ | |||
| //printf("thread_init: creating thread [%d]\n", i); | |||
| for(i = 0; i < blas_cpu_number - 1; i++) { | |||
| //MT_TRACE("thread_init: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -398,15 +371,12 @@ int blas_thread_init(void){ | |||
| return 0; | |||
| } | |||
| /* | |||
| User can call one of two routines. | |||
| exec_blas_async ... immediately returns after jobs are queued. | |||
| exec_blas ... returns after jobs are finished. | |||
| */ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| // | |||
| // User can call one of two routines. | |||
| // exec_blas_async ... immediately returns after jobs are queued. | |||
| // exec_blas ... returns after jobs are finished. | |||
| // | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -426,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); | |||
| #endif | |||
| current->finished = 0; | |||
| current->finished = 0; | |||
| current = current -> next; | |||
| pos ++; | |||
| } | |||
| @@ -435,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| if (!work_queue) | |||
| { | |||
| work_queue = queue; | |||
| work_queue = queue; | |||
| } | |||
| else | |||
| { | |||
| blas_queue_t *next_item = work_queue; | |||
| blas_queue_t *queue_item = work_queue; | |||
| // find the end of the work queue | |||
| while (next_item) | |||
| next_item = next_item->next; | |||
| // find the end of the work queue | |||
| while (queue_item->next) | |||
| queue_item = queue_item->next; | |||
| // add new work to the end | |||
| next_item = queue; | |||
| // add new work to the end | |||
| queue_item->next = queue; | |||
| } | |||
| LeaveCriticalSection(&queue_lock); | |||
| @@ -456,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Join. Wait for all queued tasks to complete | |||
| // | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Synchronization Waiting.\n"); | |||
| #endif | |||
| MT_TRACE("Synchronization Waiting.\n"); | |||
| while (num){ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Waiting Queue ..\n"); | |||
| #endif | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| while (num) { | |||
| MT_TRACE("Waiting Queue ..\n"); | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| MT_TRACE("Completely Done.\n\n"); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Completely Done.\n\n"); | |||
| #endif | |||
| // if work was added to the queue after this batch we can't sleep the worker threads | |||
| // by resetting the event | |||
| EnterCriticalSection(&queue_lock); | |||
| @@ -488,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| /* Execute Threads */ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Execute Threads | |||
| // | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -502,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| if ((num <= 0) || (queue == NULL)) return 0; | |||
| if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); | |||
| if ((num > 1) && queue -> next) | |||
| exec_blas_async(1, queue -> next); | |||
| routine = queue -> routine; | |||
| if (queue -> mode & BLAS_LEGACY) { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } else | |||
| } else { | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
| queue -> sa, queue -> sb, 0); | |||
| queue -> sa, queue -> sb, 0); | |||
| } | |||
| if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); | |||
| if ((num > 1) && queue -> next) | |||
| exec_blas_async_wait(num - 1, queue -> next); | |||
| return 0; | |||
| } | |||
| /* Shutdown procedure, but user don't have to call this routine. The */ | |||
| /* kernel automatically kill threads. */ | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| // | |||
| // Shutdown procedure; users don't have to call this routine. The | |||
| // kernel automatically kills the threads. | |||
| // | |||
| int BLASFUNC(blas_thread_shutdown)(void) { | |||
| int i; | |||
| @@ -532,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| LOCK_COMMAND(&server_lock); | |||
| if (blas_server_avail){ | |||
| if (blas_server_avail) { | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| // Could also just use WaitForMultipleObjects | |||
| DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); | |||
| @@ -556,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| return 0; | |||
| } | |||
| // | |||
| // Legacy function to set the number of threads | |||
| // | |||
| void goto_set_num_threads(int num_threads) | |||
| { | |||
| long i; | |||
| @@ -569,7 +547,7 @@ void goto_set_num_threads(int num_threads) | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| LOCK_COMMAND(&server_lock); | |||
| thread_target = num_threads; | |||
| @@ -577,11 +555,11 @@ void goto_set_num_threads(int num_threads) | |||
| SetEvent(kickoff_event); | |||
| for (i = num_threads - 1; i < blas_num_threads - 1; i++) { | |||
| //printf("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
| //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
| WaitForSingleObject(blas_threads[i], INFINITE); | |||
| //printf("set_num_threads: thread [%d] has quit.\n", i); | |||
| //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); | |||
| CloseHandle(blas_threads[i]); | |||
| } | |||
| @@ -599,8 +577,8 @@ void goto_set_num_threads(int num_threads) | |||
| thread_target = num_threads; | |||
| //increased_threads = 1; | |||
| if (!blas_server_avail){ | |||
| //increased_threads = 1; | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| @@ -609,8 +587,8 @@ void goto_set_num_threads(int num_threads) | |||
| blas_server_avail = 1; | |||
| } | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| //printf("set_num_threads: creating thread [%d]\n", i); | |||
| for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { | |||
| //MT_TRACE("set_num_threads: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -625,6 +603,9 @@ void goto_set_num_threads(int num_threads) | |||
| blas_cpu_number = num_threads; | |||
| } | |||
| // | |||
| // OpenBLAS function to set the thread count | |||
| // | |||
| void openblas_set_num_threads(int num) | |||
| { | |||
| goto_set_num_threads(num); | |||
| @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE | |||
| #else | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| @@ -1,6 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* Copyright 2023-2024 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #endif | |||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 16 | |||
| #define NUM_CORETYPES 17 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -178,6 +179,7 @@ static char *corename[] = { | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversev2", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| @@ -198,10 +200,11 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; | |||
| if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_NEOVERSEV1); | |||
| case 12: return (&gotoblas_NEOVERSEN2); | |||
| case 13: return (&gotoblas_THUNDERX3T110); | |||
| case 14: return (&gotoblas_CORTEXA55); | |||
| case 15: return (&gotoblas_ARMV8SVE); | |||
| case 12: return (&gotoblas_NEOVERSEV2); | |||
| case 13: return (&gotoblas_NEOVERSEN2); | |||
| case 14: return (&gotoblas_THUNDERX3T110); | |||
| case 15: return (&gotoblas_CORTEXA55); | |||
| case 16: return (&gotoblas_ARMV8SVE); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_NEOVERSEN1; | |||
| }else | |||
| return &gotoblas_NEOVERSEV1; | |||
| case 0xd4f: | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
| return &gotoblas_NEOVERSEN1; | |||
| } else { | |||
| return &gotoblas_NEOVERSEV2; | |||
| } | |||
| #endif | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { | |||
| #define CPU_POWER9 9 | |||
| #define CPU_POWER10 10 | |||
| #ifndef POWER_9 | |||
| #define POWER_9 0x20000 /* 9 class CPU */ | |||
| #endif | |||
| #ifndef POWER_10 | |||
| #define POWER_10 0x40000 /* 10 class CPU */ | |||
| #endif | |||
| #ifdef _AIX | |||
| #include <sys/systemcfg.h> | |||
| @@ -62,7 +69,7 @@ static int cpuid(void) | |||
| else if (arch == POWER_9) return CPU_POWER9; | |||
| #endif | |||
| #ifdef POWER_10 | |||
| else if (arch == POWER_10) return CPU_POWER10; | |||
| else if (arch >= POWER_10) return CPU_POWER10; | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) { | |||
| if (gotoblas && gotoblas -> init) { | |||
| strncpy(coren,gotoblas_corename(),20); | |||
| sprintf(coremsg, "Core: %s\n",coren); | |||
| if (getenv("GET_OPENBLAS_CORETYPE")) { | |||
| fprintf(stderr, "%s", coremsg); | |||
| } | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
| @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ | |||
| #endif | |||
| memory[pos].lock = 0; | |||
| } | |||
| if (memory_overflowed) | |||
| if (memory_overflowed) { | |||
| for (pos = 0; pos < NEW_BUFFERS; pos ++){ | |||
| newmemory[pos].addr = (void *)0; | |||
| newmemory[pos].used = 0; | |||
| @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ | |||
| newmemory[pos].pos = -1; | |||
| #endif | |||
| newmemory[pos].lock = 0; | |||
| } | |||
| free(newmemory); | |||
| newmemory = NULL; | |||
| memory_overflowed = 0; | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| @@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef SMP_SERVER | |||
| extern void openblas_set_num_threads(int num_threads) ; | |||
| extern int openblas_get_num_threads(void) ; | |||
| /* By-reference entry point: reads the requested count through the pointer | |||
| and forwards it to openblas_set_num_threads(). */ | |||
| void openblas_set_num_threads_(int* num_threads){ | |||
| const int requested = *num_threads; | |||
| openblas_set_num_threads(requested); | |||
| } | |||
| /* Applies num_threads via openblas_set_num_threads(), records it in | |||
| blas_omp_threads_local, and returns the value openblas_get_num_threads() | |||
| reported before the change. */ | |||
| int openblas_set_num_threads_local(int num_threads){ | |||
| /* Must be sampled before the setter runs, otherwise the old value is lost. */ | |||
| const int previous = openblas_get_num_threads(); | |||
| openblas_set_num_threads(num_threads); | |||
| blas_omp_threads_local = num_threads; | |||
| return previous; | |||
| } | |||
| #else | |||
| //Single thread | |||
| @@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) { | |||
| /* Non-SMP_SERVER variant of the by-reference entry point: intentionally a | |||
| no-op, the argument is ignored. */ | |||
| void openblas_set_num_threads_(int* num_threads){ | |||
| } | |||
| /* Non-SMP_SERVER variant: the requested count is ignored and 1 is always | |||
| returned. */ | |||
| int openblas_set_num_threads_local(int num_threads){ | |||
| return 1; | |||
| } | |||
| #endif | |||
| @@ -73,6 +73,10 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER)$(OSNAME), IBMAIX) | |||
| EXTRALIB += -lxlf90 | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| EXTRALIB += -pgf90libs | |||
| endif | |||
| @@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM) | |||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ifeq ($(OSNAME), Darwin) | |||
| ifeq ($(FIXED_LIBNAME),1) | |||
| INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib | |||
| else | |||
| INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| endif | |||
| endif | |||
| ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | |||
| $(LIBDYNNAME) : ../$(LIBNAME) osx.def | |||
| @@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so | |||
| FEXTRALIB += -lm | |||
| EXTRALIB += -lm | |||
| else | |||
| ifeq ($(FIXED_LIBNAME),1) | |||
| INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so | |||
| else | |||
| INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| endif | |||
| ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | |||
| ../$(LIBSONAME) : ../$(LIBNAME) linktest.c | |||
| @@ -248,6 +260,20 @@ endif | |||
| ifeq ($(OSNAME), AIX) | |||
| so : ../$(LIBSONAME) linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK. | |||
| rm -f linktest | |||
| ../$(LIBSONAME) : aix.exp | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,-bcdtors:all:-2147481648:s,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB) | |||
| aix.exp : | |||
| /usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \ | |||
| || ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \ | |||
| { if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \ | |||
| /usr/bin/sort -u > aix.exp | |||
| ifeq ($(COMPILER_F77), xlf) | |||
| goto32.$(SUFFIX) : ../$(LIBNAME) aix.def | |||
| @@ -289,6 +315,11 @@ test : linktest.c | |||
| linktest.c : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| ifeq ($(F_COMPILER), IBM) | |||
| mv linktest.c linktest.c.FIRST | |||
| egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c | |||
| rm linktest.c.FIRST | |||
| endif | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -60,6 +60,7 @@ cblasobjsc=" | |||
| cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv | |||
| cblas_scnrm2 cblas_scasum cblas_cgemmt | |||
| cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy | |||
| cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin | |||
| " | |||
| cblasobjsd=" | |||
| cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot | |||
| @@ -69,6 +70,7 @@ cblasobjsd=" | |||
| cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv | |||
| cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt | |||
| cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy | |||
| cblas_damax cblas_damin | |||
| " | |||
| cblasobjss=" | |||
| @@ -80,6 +82,7 @@ cblasobjss=" | |||
| cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm | |||
| cblas_strsv cblas_sgeadd cblas_sgemmt | |||
| cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy | |||
| cblas_samax cblas_samin | |||
| " | |||
| cblasobjsz=" | |||
| @@ -91,6 +94,7 @@ cblasobjsz=" | |||
| cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub | |||
| cblas_zaxpby cblas_zgeadd cblas_zgemmt | |||
| cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy | |||
| cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin | |||
| " | |||
| cblasobjs="cblas_xerbla" | |||
| @@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z | |||
| zgedmd | |||
| zgedmdq | |||
| " | |||
| #functions added post 3.11 | |||
| lapackobjs2c="$lapackobjs2c | |||
| claqp2rk | |||
| claqp3rk | |||
| ctrsyl3 | |||
| " | |||
| # claqz0 | |||
| # claqz1 | |||
| # claqz2 | |||
| # claqz3 | |||
| # clatrs3 | |||
| lapackobjs2d="$lapackobjs2d | |||
| dgelqs | |||
| dgelst | |||
| dgeqp3rk | |||
| dgeqrs | |||
| dlaqp2rk | |||
| dlaqp3rk | |||
| dlarmm | |||
| dlatrs3 | |||
| dtrsyl3 | |||
| " | |||
| # dlaqz0 | |||
| # dlaqz1 | |||
| # dlaqz2 | |||
| # dlaqz3 | |||
| # dlaqz4 | |||
| lapackobjs2z="$lapackobjs2z | |||
| zgelqs | |||
| zgelst | |||
| zgeqp3rk | |||
| zgeqrs | |||
| zlaqp2rk | |||
| zlaqp3rk | |||
| zlatrs3 | |||
| zrscl | |||
| ztrsyl3 | |||
| " | |||
| # zlaqz0 | |||
| # zlaqz1 | |||
| # zlaqz2 | |||
| # zlaqz3 | |||
| lapack_extendedprecision_objs=" | |||
| zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx | |||
| dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx | |||
| @@ -1622,6 +1673,14 @@ lapackeobjsc=" | |||
| LAPACKE_cgetsqrhrt_work | |||
| LAPACKE_cungtsqr_row | |||
| LAPACKE_cungtsqr_row_work | |||
| LAPACKE_clangb | |||
| LAPACKE_clangb_work | |||
| LAPACKE_ctrsyl3 | |||
| LAPACKE_ctrsyl3_work | |||
| LAPACKE_ctz_nancheck | |||
| LAPACKE_ctz_trans | |||
| LAPACKE_cunhr_col | |||
| LAPACKE_cunhr_col_work | |||
| " | |||
| lapackeobjsd=" | |||
| @@ -2239,6 +2298,14 @@ lapackeobjsd=" | |||
| LAPACKE_dgetsqrhrt_work | |||
| LAPACKE_dorgtsqr_row | |||
| LAPACKE_dorgtsqr_row_work | |||
| LAPACKE_dlangb | |||
| LAPACKE_dlangb_work | |||
| LAPACKE_dorhr_col | |||
| LAPACKE_dorhr_col_work | |||
| LAPACKE_dtrsyl3 | |||
| LAPACKE_dtrsyl3_work | |||
| LAPACKE_dtz_nancheck | |||
| LAPACKE_dtz_trans | |||
| " | |||
| lapackeobjss=" | |||
| @@ -2848,6 +2915,14 @@ lapackeobjss=" | |||
| LAPACKE_sgetsqrhrt_work | |||
| LAPACKE_sorgtsqr_row | |||
| LAPACKE_sorgtsqr_row_work | |||
| LAPACKE_slangb | |||
| LAPACKE_slangb_work | |||
| LAPACKE_sorhr_col | |||
| LAPACKE_sorhr_col_work | |||
| LAPACKE_strsyl3 | |||
| LAPACKE_strsyl3_work | |||
| LAPACKE_stz_nancheck | |||
| LAPACKE_stz_trans | |||
| " | |||
| lapackeobjsz=" | |||
| @@ -3515,6 +3590,14 @@ lapackeobjsz=" | |||
| LAPACKE_zgetsqrhrt_work | |||
| LAPACKE_zungtsqr_row | |||
| LAPACKE_zungtsqr_row_work | |||
| LAPACKE_zlangb | |||
| LAPACKE_zlangb_work | |||
| LAPACKE_ztrsyl3 | |||
| LAPACKE_ztrsyl3_work | |||
| LAPACKE_ztz_nancheck | |||
| LAPACKE_ztz_trans | |||
| LAPACKE_zunhr_col | |||
| LAPACKE_zunhr_col_work | |||
| " | |||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | |||
| @@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" | |||
| ssysv_aa_2stage ssytrf_aa_2stage | |||
| ssytrs_aa_2stage | |||
| slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col | |||
| slarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_c=" | |||
| chetf2_rook chetrf_rook chetri_rook | |||
| @@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" | |||
| csysv_aa_2stage csytrf_aa_2stage | |||
| csytrs_aa_2stage | |||
| claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col | |||
| clarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_d=" | |||
| dlasyf_rook | |||
| @@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" | |||
| dsysv_aa_2stage | |||
| dsytrf_aa_2stage dsytrs_aa_2stage | |||
| dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col | |||
| dlarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_z=" | |||
| zhetf2_rook zhetrf_rook zhetri_rook | |||
| @@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" | |||
| zhetrs_aa_2stage zsysv_aa_2stage | |||
| zsytrf_aa_2stage zsytrs_aa_2stage | |||
| zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col | |||
| zlarfb_gett | |||
| " | |||
| dirname=`pwd -P`/../lapack-netlib | |||
| @@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then | |||
| pathf90 pathf95 | |||
| pgf95 pgf90 pgf77 pgfortran nvfortran | |||
| flang egfortran | |||
| ifort nagfor ifx ftn crayftn" | |||
| ifort nagfor ifx ftn crayftn armflang" | |||
| for list in $lists; do | |||
| for p in $path; do | |||
| @@ -85,7 +85,11 @@ else | |||
| *Hewlett*) | |||
| vendor=CRAY | |||
| openmp='-fopenmp' | |||
| ;; | |||
| ;; | |||
| *Arm\ F90*) | |||
| vendor=FLANG | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *GNU*|*GCC*) | |||
| v="${data#*GCC: *\) }" | |||
| @@ -108,7 +112,7 @@ else | |||
| if [ "$major" -ge 17 ]; then | |||
| vendor=FLANGNEW | |||
| fi | |||
| ;; | |||
| ;; | |||
| *ifort*|*ifx*) | |||
| vendor=INTEL | |||
| openmp='-fopenmp' | |||
| @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/sysinfo.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| #if defined(AIX) | |||
| #if defined(_AIX) | |||
| #include <unistd.h> | |||
| #include <sys/systemcfg.h> | |||
| #include <sys/sysinfo.h> | |||
| #endif | |||
| @@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_EV4 */ | |||
| /* #define FORCE_EV5 */ | |||
| /* #define FORCE_EV6 */ | |||
| /* #define FORCE_CSKY */ | |||
| /* #define FORCE_CK860FV */ | |||
| /* #define FORCE_GENERIC */ | |||
| #ifdef FORCE_P2 | |||
| @@ -1327,6 +1331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "CORTEXA73" | |||
| #endif | |||
| #ifdef FORCE_CORTEXA76 | |||
| /* Fixed settings used when the CORTEXA76 target is forced: sets FORCE, the | |||
| architecture/core name strings, and ARCHCONFIG, a single string of -D flags | |||
| built from the adjacent literals below (each literal except the last ends | |||
| with a space so the flags stay separated). */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "CORTEXA76" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA76 " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa76" | |||
| #define CORENAME "CORTEXA76" | |||
| #endif | |||
| #ifdef FORCE_CORTEXX1 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| @@ -1677,9 +1696,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "c910v" | |||
| #define CORENAME "C910V" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_x280 | |||
| /* Fixed settings used when the x280 target is forced: sets FORCE, the | |||
| RISCV64 architecture/core name strings, and the ARCHCONFIG -D flag string. */ | |||
| /* NOTE(review): L1_DATA_SIZE=64536 is not a power of two (65536 would be); | |||
| possibly a typo - confirm against the core's documentation before changing. */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "x280" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-Dx280 " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "x280" | |||
| #define CORENAME "x280" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL256B | |||
| /* Fixed settings used when the RISCV64_ZVL256B target is forced: sets FORCE, | |||
| the RISCV64 architecture/core name strings, and the ARCHCONFIG -D flag | |||
| string. */ | |||
| /* NOTE(review): L1_DATA_SIZE=64536 is not a power of two (65536 would be); | |||
| possibly a typo - confirm the intended value before changing. */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL256B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL256B " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl256b" | |||
| #define CORENAME "RISCV64_ZVL256B" | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL128B | |||
| /* Fixed settings used when the RISCV64_ZVL128B target is forced: sets FORCE, | |||
| the RISCV64 architecture/core name strings, and the ARCHCONFIG -D flag | |||
| string. */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL128B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL128B " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl128b" | |||
| #define CORENAME "RISCV64_ZVL128B" | |||
| #endif | |||
| #if defined(FORCE_E2K) || defined(__e2k__) | |||
| #define FORCE | |||
| @@ -1692,6 +1748,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "generic" | |||
| #endif | |||
| #ifdef FORCE_CSKY | |||
| /* Fixed settings used when the CSKY target is forced: sets FORCE, the CSKY | |||
| architecture/core name strings, and the ARCHCONFIG -D flag string. */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "CSKY" | |||
| #define SUBARCHITECTURE "CSKY" | |||
| #define SUBDIRNAME "csky" | |||
| /* Adjacent string literals are concatenated, so the first literal must end | |||
| with a space; without it the result starts "-DCSKY-DL1_DATA_SIZE=65536" and | |||
| the two flags fuse into one token. */ | |||
| #define ARCHCONFIG "-DCSKY " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "csky" | |||
| #define CORENAME "CSKY" | |||
| #endif | |||
| #ifdef FORCE_CK860FV | |||
| /* Fixed settings used when the CK860FV target is forced: sets FORCE, the | |||
| CSKY architecture/core name strings, and the ARCHCONFIG -D flag string. */ | |||
| /* NOTE(review): SUBARCHITECTURE is "CK860V" while the guard macro, the | |||
| -DCK860FV flag, LIBNAME and CORENAME all say CK860FV - confirm whether the | |||
| missing 'F' is intentional. */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "CSKY" | |||
| #define SUBARCHITECTURE "CK860V" | |||
| #define SUBDIRNAME "csky" | |||
| #define ARCHCONFIG "-DCK860FV " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "ck860fv" | |||
| #define CORENAME "CK860FV" | |||
| #endif | |||
| #ifndef FORCE | |||
| #ifdef USER_TARGET | |||
| @@ -1766,7 +1849,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define OPENBLAS_SUPPORTED | |||
| #endif | |||
| #ifndef OPENBLAS_SUPPORTED | |||
| #error "This arch/CPU is not supported by OpenBLAS." | |||
| #endif | |||
| @@ -1805,11 +1887,13 @@ static int get_num_cores(void) { | |||
| return count; | |||
| #elif defined(AIX) | |||
| #elif defined(_AIX) | |||
| //returns the number of processors which are currently online | |||
| count = sysconf(_SC_NPROCESSORS_ONLN); | |||
| if (count <= 0) count = 2; | |||
| return count; | |||
| #else | |||
| return 2; | |||
| #endif | |||
| @@ -1831,7 +1915,7 @@ int main(int argc, char *argv[]){ | |||
| #ifdef FORCE | |||
| printf("CORE=%s\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) | |||
| printf("CORE=%s\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -1979,7 +2063,7 @@ printf("ELF_VERSION=2\n"); | |||
| #ifdef FORCE | |||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | |||
| #else | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) | |||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) | |||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | |||
| #endif | |||
| #endif | |||
| @@ -119,6 +119,7 @@ endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| @@ -130,6 +131,8 @@ endif () | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ | |||
| cblas_samin.$(SUFFIX) | |||
| CSBLAS2OBJS = \ | |||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
| @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ | |||
| cblas_damin.$(SUFFIX) | |||
| CDBLAS2OBJS = \ | |||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
| @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| @@ -340,12 +342,12 @@ CXERBLAOBJ = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||
| @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c | |||
| cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (ldc < MAX(1, m)) info = 8; | |||
| if (lda < MAX(1, m)) info = 5; | |||
| if (n < 0) info = 2; | |||
| if (m < 0) info = 1; | |||
| if (n < 0) info = 1; | |||
| if (m < 0) info = 2; | |||
| } | |||
| if (info >= 0) { | |||
| @@ -533,8 +533,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| args.nthreads = 1; | |||
| else | |||
| else { | |||
| args.nthreads = num_cpu_avail(3); | |||
| if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) | |||
| args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||
| } | |||
| args.common = NULL; | |||
| if (args.nthreads == 1) { | |||
| @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| #if defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| @@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| uplo = 0; | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| info = 0; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowa)) | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowb)) | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| @@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| blasint info; | |||
| blasint lda, ldb; | |||
| FLOAT *a, *b; | |||
| #if defined(COMPLEX) | |||
| blasint nrowb, ncolb; | |||
| #endif | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint nrowa, nrowb; | |||
| blasint nrowa; | |||
| #if !defined(COMPLEX) | |||
| blasint nrowb; | |||
| #endif | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| @@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint ncola, ncolb; | |||
| ncola = k; | |||
| if (transa) ncola = m; | |||
| ncolb = m; | |||
| if (transb) ncolb = k; | |||
| blasint ncola; | |||
| #if !defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| ncola = m; | |||
| if (transa & 1) ncola = k; | |||
| ncolb = k; | |||
| #if defined(COMPLEX) | |||
| nrowb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| #if defined(COMPLEX) | |||
| nrowb = k; | |||
| #endif | |||
| ncolb = m; | |||
| } | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 10; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 8; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 10; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 3; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| @@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_START; | |||
| const blasint incb = (transb == 0) ? 1 : ldb; | |||
| #if defined(COMPLEX) | |||
| if (transb > 1){ | |||
| #ifndef CBLAS | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #else | |||
| if (order == CblasColMajor) | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| if (order == CblasRowMajor) | |||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #endif | |||
| } | |||
| #endif | |||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||
| if (uplo == 1) { | |||
| for (i = 0; i < m; i++) { | |||
| @@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #if defined(COMPLEX) | |||
| aa = a + i * 2; | |||
| bb = b + i * ldb * 2; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i * 2; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i * 2; | |||
| cc = c + i * 2 * ldc + i * 2; | |||
| #else | |||
| aa = a + i; | |||
| bb = b + i * ldb; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i; | |||
| cc = c + i * ldc + i; | |||
| #endif | |||
| @@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -472,13 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| @@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| else | |||
| @@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| @@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| l = j; | |||
| #if defined COMPLEX | |||
| bb = b + i * ldb * 2; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i * 2; | |||
| } | |||
| cc = c + i * 2 * ldc; | |||
| #else | |||
| bb = b + i * ldb; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i; | |||
| } | |||
| cc = c + i * ldc; | |||
| @@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -561,13 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| @@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | |||
| incb, cc, 1, buffer); | |||
| else | |||
| @@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, cc, 1, | |||
| buffer, nthreads); | |||
| @@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| #ifdef SMP | |||
| if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| @@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| @@ -95,14 +95,19 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| #ifndef DOUBLE | |||
| if (args.m*args.n < 40000) | |||
| #else | |||
| if (args.m*args.n < 10000) | |||
| int nmax = 40000; | |||
| #else | |||
| int nmax = 10000; | |||
| #endif | |||
| args.nthreads=1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.m*args.n <nmax) { | |||
| args.nthreads = 1; | |||
| } else { | |||
| args.nthreads = num_cpu_avail(4); | |||
| if ((args.m*args.n)/args.nthreads <nmax) | |||
| args.nthreads = (args.m*args.n)/nmax; | |||
| } | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -113,13 +113,17 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| #ifndef DOUBLE | |||
| if (args.n <128) | |||
| #else | |||
| if (args.n <64) | |||
| int nmax = 128; | |||
| #else | |||
| int nmax = 64; | |||
| #endif | |||
| if (args.n <nmax) { | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| } else { | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.n/args.nthreads <nmax) | |||
| args.nthreads = args.n/nmax; | |||
| } | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * | |||
| if (trans_arg == 'R') trans = 0; | |||
| if (trans_arg == 'C') trans = 1; | |||
| TOUPPER(uplo_arg); | |||
| uplo = -1; | |||
| if (uplo_arg == 'U') uplo = 0; | |||
| if (uplo_arg == 'L') uplo = 1; | |||
| TOUPPER(diag_arg); | |||
| diag = -1; | |||
| if (diag_arg == 'U') diag = 0; | |||
| if (diag_arg == 'N') diag = 1; | |||
| @@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT * | |||
| if (trans_arg == 'R') trans = 2; | |||
| if (trans_arg == 'C') trans = 3; | |||
| TOUPPER(uplo_arg); | |||
| uplo = -1; | |||
| if (uplo_arg == 'U') uplo = 0; | |||
| if (uplo_arg == 'L') uplo = 1; | |||
| TOUPPER(diag_arg); | |||
| diag = -1; | |||
| if (diag_arg == 'U') diag = 0; | |||
| if (diag_arg == 'N') diag = 1; | |||
| @@ -46,6 +46,12 @@ | |||
| #ifdef USE_ABS | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #ifndef USE_MIN | |||
| /* ABS & MAX */ | |||
| @@ -92,6 +98,8 @@ | |||
| #else | |||
| #define ABS | |||
| #ifndef USE_MIN | |||
| /* MAX */ | |||
| @@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| if (n <= 0) return 0; | |||
| #ifndef COMPLEX | |||
| if (incx == 0) return (ABS(*x)); | |||
| #else | |||
| if (incx == 0) return (ABS(*x) + ABS(*(x+1))); | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -145,14 +159,25 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| #else | |||
| #ifdef COMPLEX | |||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
| FLOAT *x = (FLOAT*) vx; | |||
| #else | |||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| #endif | |||
| FLOAT ret; | |||
| PRINT_DEBUG_CNAME; | |||
| if (n <= 0) return 0; | |||
| #ifndef COMPLEX | |||
| if (incx == 0) return (ABS(*x)); | |||
| #else | |||
| if (incx == 0) return (ABS(*x) + ABS(*(x+1))); | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| else | |||
| { | |||
| dp2 = *dd2 * dy1; | |||
| if(dp2 == ZERO) | |||
| { | |||
| dflag = -TWO; | |||
| dparam[0] = dflag; | |||
| return; | |||
| } | |||
| dp1 = *dd1 * *dx1; | |||
| dq2 = dp2 * dy1; | |||
| dq1 = dp1 * *dx1; | |||
| @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| dh12 = dp2 / dp1; | |||
| du = ONE - dh12 * dh21; | |||
| if(du > ZERO) | |||
| { | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } else { | |||
| dflag = -ONE; | |||
| dh11 = ZERO; | |||
| dh12 = ZERO; | |||
| dh21 = ZERO; | |||
| dh22 = ZERO; | |||
| *dd1 = ZERO; | |||
| *dd2 = ZERO; | |||
| *dx1 = ZERO; | |||
| } | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } | |||
| else | |||
| @@ -0,0 +1,447 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2024, The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #define SMP_THRESHOLD_MIN 65536.0 /* NOTE(review): not referenced by the routine defined below in this file - confirm it is still needed */ | |||
| #define ERROR_NAME "SBGEMMT " /* routine name handed to xerbla when an argument check fails */ | |||
| #ifndef GEMM_MULTITHREAD_THRESHOLD /* only supply a value when nothing earlier has defined one */ | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #endif | |||
| #ifndef CBLAS /* Fortran-style entry point: every argument arrives by reference and is copied into locals below */ | |||
| void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| blasint * M, blasint * K, | |||
| FLOAT * Alpha, | |||
| IFLOAT * a, blasint * ldA, | |||
| IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) | |||
| { | |||
| blasint m, k; | |||
| blasint lda, ldb, ldc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| FLOAT alpha, beta; | |||
| PRINT_DEBUG_NAME; | |||
| m = *M; /* dereference the by-reference arguments once */ | |||
| k = *K; | |||
| alpha = *Alpha; | |||
| beta = *Beta; | |||
| lda = *ldA; | |||
| ldb = *ldB; | |||
| ldc = *ldC; | |||
| transA = *TRANSA; | |||
| transB = *TRANSB; | |||
| Uplo = *UPLO; | |||
| TOUPPER(transA); /* presumably upper-cases the option character so only capitals are compared below */ | |||
| TOUPPER(transB); | |||
| TOUPPER(Uplo); | |||
| transa = -1; /* -1 means "option not recognised"; reported as info 2, 3 or 1 further down */ | |||
| transb = -1; | |||
| uplo = -1; | |||
| if (transA == 'N') | |||
| transa = 0; | |||
| if (transA == 'T') | |||
| transa = 1; | |||
| if (transA == 'R') /* 'R' maps to the same value as 'N' in this routine */ | |||
| transa = 0; | |||
| if (transA == 'C') /* 'C' maps to the same value as 'T' in this routine */ | |||
| transa = 1; | |||
| if (transB == 'N') | |||
| transb = 0; | |||
| if (transB == 'T') | |||
| transb = 1; | |||
| if (transB == 'R') | |||
| transb = 0; | |||
| if (transB == 'C') | |||
| transb = 1; | |||
| if (Uplo == 'U') | |||
| uplo = 0; | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; /* minimum leading dimension of A: m, or k when A is transposed */ | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; /* minimum leading dimension of B: k, or m when B is transposed */ | |||
| if (transb & 1) nrowb = m; | |||
| info = 0; /* checks run from last argument to first, so the lowest failing position wins */ | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| if (info != 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); /* report the offending argument position, then give up */ | |||
| return; | |||
| } | |||
| #else /* CBLAS entry point: scalars by value plus an explicit storage order */ | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, | |||
| blasint k, | |||
| FLOAT alpha, | |||
| IFLOAT * A, blasint LDA, | |||
| IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) | |||
| { | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| blasint lda, ldb; | |||
| IFLOAT *a, *b; | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| uplo = -1; /* -1 means "option not recognised" */ | |||
| transa = -1; | |||
| transb = -1; | |||
| info = 0; /* stays 0 (and is therefore reported) when order is neither ColMajor nor RowMajor */ | |||
| if (order == CblasColMajor) { | |||
| if (Uplo == CblasUpper) uplo = 0; | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransA == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasTrans) | |||
| transa = 1; | |||
| if (TransA == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transa = 1; | |||
| if (TransB == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasTrans) | |||
| transb = 1; | |||
| if (TransB == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transb = 1; | |||
| a = (void *)A; | |||
| b = (void *)B; | |||
| lda = LDA; | |||
| ldb = LDB; | |||
| info = -1; /* in the CBLAS path -1 means "no error"; any value >= 0 is reported below */ | |||
| blasint nrowa; | |||
| blasint nrowb; | |||
| nrowa = m; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb & 1) nrowb = m; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| if (order == CblasRowMajor) { /* row-major: the roles of A and B, and of their transpose flags, are exchanged */ | |||
| a = (void *)B; | |||
| b = (void *)A; | |||
| lda = LDB; | |||
| ldb = LDA; | |||
| if (Uplo == CblasUpper) uplo = 0; /* NOTE(review): uplo is not flipped although A and B are swapped for row-major - confirm this is intended */ | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransB == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasTrans) | |||
| transa = 1; | |||
| if (TransB == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transa = 1; | |||
| if (TransA == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasTrans) | |||
| transb = 1; | |||
| if (TransA == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transb = 1; | |||
| info = -1; /* -1 means "no error" here as well */ | |||
| blasint ncola; | |||
| blasint ncolb; | |||
| ncola = m; | |||
| if (transa & 1) ncola = k; | |||
| ncolb = k; | |||
| if (transb & 1) { | |||
| ncolb = m; | |||
| } | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 8; /* ldb holds the caller's LDA after the swap, hence argument position 8 */ | |||
| if (lda < MAX(1, ncola)) | |||
| info = 10; /* lda holds the caller's LDB after the swap, hence argument position 10 */ | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 2; /* transb was derived from the caller's TransA */ | |||
| if (transa < 0) | |||
| info = 3; /* transa was derived from the caller's TransB */ | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| if (info >= 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #endif /* from here on the body is shared by both entry points */ | |||
| int buffer_size; | |||
| blasint i, j; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| #endif | |||
| #ifdef SMP | |||
| static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, | |||
| BLASLONG, IFLOAT *, BLASLONG, FLOAT, | |||
| FLOAT *, BLASLONG, int) = { | |||
| sbgemv_thread_n, sbgemv_thread_t, /* indexed by transa: 0 selects the _n kernel, 1 the _t kernel */ | |||
| }; | |||
| #endif | |||
| int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, | |||
| IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { | |||
| SBGEMV_N, SBGEMV_T,}; /* single-threaded kernels, indexed by transa like the table above */ | |||
| if (m == 0) /* nothing to compute */ | |||
| return; | |||
| IDEBUG_START; | |||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; /* stride of the B vector handed to gemv: 1 when B is not transposed, ldb otherwise */ | |||
| if (uplo == 1) { /* uplo == 1 was set for 'L' / CblasLower: one gemv call per column, starting at the diagonal */ | |||
| for (i = 0; i < m; i++) { | |||
| j = m - i; /* number of entries of column i from the diagonal down to row m-1 */ | |||
| aa = a + i; /* skip the first i rows of A (non-transposed layout) */ | |||
| bb = b + i * ldb; /* column i of B (non-transposed layout) */ | |||
| if (transa & 1) { | |||
| aa = a + lda * i; /* transposed layout: skip the first i columns instead */ | |||
| } | |||
| if (transb & 1) | |||
| bb = b + i; /* transposed layout: row i of B, walked with stride ldb */ | |||
| cc = c + i * ldc + i; /* diagonal element (i, i) of C */ | |||
| #if 0 /* disabled code; note that it refers to l, which is not declared in this function */ | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) | |||
| continue; | |||
| #endif | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); /* NOTE(review): buffer is allocated and freed but never passed to the gemv calls below */ | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; /* round up to a multiple of 4 elements */ | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) /* small products stay single-threaded */ | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| if (!(transa & 1)) /* non-transposed A: dimensions passed as (j, k) */ | |||
| (gemv[(int)transa]) (j, k, alpha, aa, lda, | |||
| bb, incb, beta, cc, 1); | |||
| else /* transposed A: dimensions passed as (k, j) */ | |||
| (gemv[(int)transa]) (k, j, alpha, aa, lda, | |||
| bb, incb, beta, cc, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, beta, cc, | |||
| 1, nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, aa, | |||
| lda, bb, incb, beta, cc, | |||
| 1, nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| } else { /* uplo == 0 ('U' / CblasUpper): one gemv call per column, from row 0 through the diagonal */ | |||
| for (i = 0; i < m; i++) { | |||
| j = i + 1; /* number of entries of column i from row 0 through the diagonal */ | |||
| bb = b + i * ldb; /* column i of B (non-transposed layout) */ | |||
| if (transb & 1) { | |||
| bb = b + i; /* transposed layout: row i of B, walked with stride ldb */ | |||
| } | |||
| cc = c + i * ldc; /* top of column i of C */ | |||
| #if 0 /* disabled code; note that it refers to l, which is not declared in this function */ | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) | |||
| continue; | |||
| #endif | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); /* NOTE(review): buffer is allocated and freed but never passed to the gemv calls below */ | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; /* round up to a multiple of 4 elements */ | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) /* small products stay single-threaded */ | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| if (!(transa & 1)) /* non-transposed A: dimensions passed as (j, k); A is used from its start */ | |||
| (gemv[(int)transa]) (j, k, alpha, a, lda, bb, | |||
| incb, beta, cc, 1); | |||
| else /* transposed A: dimensions passed as (k, j) */ | |||
| (gemv[(int)transa]) (k, j, alpha, a, lda, bb, | |||
| incb, beta, cc, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, beta, cc, 1, | |||
| nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, a, lda, | |||
| bb, incb, beta, cc, 1, | |||
| nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| } | |||
| IDEBUG_END; | |||
| return; | |||
| } | |||
| @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) | |||
| void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) | |||
| { | |||
| blasint n = *N; | |||
| blasint incx = *INCX; | |||
| blasint incy = *INCY; | |||
| FLOAT* ALPHA = (FLOAT*) VALPHA; | |||
| FLOAT* BETA = (FLOAT*) VBETA; | |||
| #else | |||
| @@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
| info = 0; | |||
| if (lda < MAX(1, m)) info = 6; | |||
| if (lda < MAX(1, m)) info = 5; | |||
| if (ldc < MAX(1, m)) info = 8; | |||
| if (n < 0) info = 2; | |||
| @@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (ldc < MAX(1, m)) info = 8; | |||
| if (lda < MAX(1, m)) info = 5; | |||
| if (n < 0) info = 2; | |||
| if (m < 0) info = 1; | |||
| if (n < 0) info = 1; | |||
| if (m < 0) info = 2; | |||
| } | |||
| if (info >= 0) { | |||
| @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| @@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
| if (ada >= h *safmin) { | |||
| *C = sqrt(ada/h); | |||
| *R = *DA / *C; | |||
| *(R+1) = *(DA+1) / *(C+1); | |||
| *(R+1) = *(DA+1) / *C; | |||
| rtmax *= 2.; | |||
| if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow | |||
| *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); | |||
| @@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
| *C = ada / adahsq; | |||
| if (*C >= safmin) { | |||
| *R = *DA / *C; | |||
| *(R+1) = *(DA+1) / *(C+1); | |||
| *(R+1) = *(DA+1) / *C; | |||
| } else { | |||
| *R = *DA * (h / adahsq); | |||
| *(R+1) = *(DA+1) * (h / adahsq); | |||
| @@ -1349,6 +1349,9 @@ endif () | |||
| set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") | |||
| get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) | |||
| set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") | |||
| if (USE_GEMM3M) | |||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||
| endif() | |||
| endfunction () | |||
| @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( n == 1 ) return( ABS(x[0]) ); | |||
| n *= inc_x; | |||
| while(i < n) | |||
| while(abs(i) < abs(n)) | |||
| { | |||
| if ( x[i] != 0.0 ) | |||
| @@ -62,7 +62,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| inc_x2 = 2 * inc_x; | |||
| n *= inc_x2; | |||
| while(i < n) | |||
| while(abs(i) < abs(n)) | |||
| { | |||
| if ( x[i] != 0.0 ) | |||
| @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| else | |||
| { | |||
| temp = - da_i * x[ip+1] ; | |||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||
| x[ip+1] = da_i * x[ip] ; | |||
| } | |||
| } | |||
| @@ -1,3 +1,5 @@ | |||
| CSUMKERNEL=csum.S | |||
| ifndef SNRM2KERNEL | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| endif | |||
| @@ -1,3 +1,6 @@ | |||
| CSUMKERNEL = csum_thunderx2t99.c | |||
| ZSUMKERNEL = zsum_thunderx2t99.c | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| @@ -0,0 +1,3 @@ | |||
| include $(KERNELDIR)/KERNEL.CORTEXA57 | |||
| @@ -91,8 +91,8 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| @@ -0,0 +1,247 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #define N "x0" /* vector length */ | |||
| #define X "x1" /* "X" vector address */ | |||
| #define INC_X "x2" /* "X" stride */ | |||
| #define J "x5" /* loop variable */ | |||
| #define REG0 "wzr" /* zero register, used to clear the accumulators before summing */ | |||
| #define SUMF "s0" /* scalar single-precision accumulator */ | |||
| #define SUMFD "d0" /* 64-bit name of the same register, used for the final move to the output operand */ | |||
| /******************************************************************************/ | |||
| #define KERNEL_F1 /* unit-stride step: load one 8-byte element and add both of its 4-byte halves into SUMF */ \ | |||
| "ldr d1, ["X"] \n" \ | |||
| "add "X", "X", #8 \n" \ | |||
| "ext v2.8b, v1.8b, v1.8b, #4 \n" /* bring the upper 4-byte half down into s2 */ \ | |||
| "fadd s1, s1, s2 \n" \ | |||
| "fadd "SUMF", "SUMF", s1 \n" | |||
| #define KERNEL_F32 /* unit-stride block: loads 256 bytes of X into v16-v31 and accumulates them into v0-v7 */ \ | |||
| "ldr q16, ["X"] \n" \ | |||
| "ldr q17, ["X", #16] \n" \ | |||
| "ldr q18, ["X", #32] \n" \ | |||
| "ldr q19, ["X", #48] \n" \ | |||
| "ldp q20, q21, ["X", #64] \n" \ | |||
| "ldp q22, q23, ["X", #96] \n" \ | |||
| "ldp q24, q25, ["X", #128] \n" \ | |||
| "ldp q26, q27, ["X", #160] \n" \ | |||
| "fadd v16.4s, v16.4s, v17.4s \n" \ | |||
| "fadd v18.4s, v18.4s, v19.4s \n" \ | |||
| "ldp q28, q29, ["X", #192] \n" \ | |||
| "ldp q30, q31, ["X", #224] \n" \ | |||
| "add "X", "X", #256 \n" /* advance past the 256 bytes just loaded */ \ | |||
| "fadd v20.4s, v20.4s, v21.4s \n" \ | |||
| "fadd v22.4s, v22.4s, v23.4s \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024] \n" /* prefetch data 1024 bytes beyond the updated pointer */ \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ | |||
| "fadd v24.4s, v24.4s, v25.4s \n" \ | |||
| "fadd v26.4s, v26.4s, v27.4s \n" \ | |||
| "fadd v0.4s, v0.4s, v16.4s \n" \ | |||
| "fadd v1.4s, v1.4s, v18.4s \n" \ | |||
| "fadd v2.4s, v2.4s, v20.4s \n" \ | |||
| "fadd v3.4s, v3.4s, v22.4s \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ | |||
| "fadd v28.4s, v28.4s, v29.4s \n" \ | |||
| "fadd v30.4s, v30.4s, v31.4s \n" \ | |||
| "fadd v4.4s, v4.4s, v24.4s \n" \ | |||
| "fadd v5.4s, v5.4s, v26.4s \n" \ | |||
| "fadd v6.4s, v6.4s, v28.4s \n" \ | |||
| "fadd v7.4s, v7.4s, v30.4s \n" | |||
| #define KERNEL_F32_FINALIZE /* fold the eight vector accumulators v0-v7 into the single scalar SUMF */ \ | |||
| "fadd v0.4s, v0.4s, v1.4s \n" \ | |||
| "fadd v2.4s, v2.4s, v3.4s \n" \ | |||
| "fadd v4.4s, v4.4s, v5.4s \n" \ | |||
| "fadd v6.4s, v6.4s, v7.4s \n" \ | |||
| "fadd v0.4s, v0.4s, v2.4s \n" \ | |||
| "fadd v4.4s, v4.4s, v6.4s \n" \ | |||
| "fadd v0.4s, v0.4s, v4.4s \n" \ | |||
| "ext v1.16b, v0.16b, v0.16b, #8 \n" /* move the upper two lanes down so they can be added to the lower two */ \ | |||
| "fadd v0.2s, v0.2s, v1.2s \n" \ | |||
| "faddp "SUMF", v0.2s \n" | |||
| #define INIT_S /* strided path setup: shift the stride left by 3 (multiply by 8) so it can be added to the X pointer */ \ | |||
| "lsl "INC_X", "INC_X", #3 \n" | |||
| #define KERNEL_S1 /* strided step: same as KERNEL_F1 but advances X by the scaled stride */ \ | |||
| "ldr d1, ["X"] \n" \ | |||
| "add "X", "X", "INC_X" \n" \ | |||
| "ext v2.8b, v1.8b, v1.8b, #4 \n" \ | |||
| "fadd s1, s1, s2 \n" \ | |||
| "fadd "SUMF", "SUMF", s1 \n" | |||
| #if defined(SMP) | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
| /* Sum all 4-byte halves of n 8-byte elements of x (no absolute values are taken), | |||
|  * stepping inc_x elements between loads, using AArch64 NEON inline assembly. | |||
|  * Returns 0 when n or inc_x is not positive: the assembly zeroes the | |||
|  * accumulator first and then branches straight to the exit label. | |||
|  * inc_x == 1 uses the 32-element unrolled kernel; any other positive stride | |||
|  * uses the one-element-at-a-time strided kernel. */ | |||
| static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT asum = 0.0 ; | |||
| if ( n < 0 ) return(asum); | |||
| __asm__ __volatile__ ( | |||
| " mov "N", %[N_] \n" | |||
| " mov "X", %[X_] \n" | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " fmov "SUMF", "REG0" \n" | |||
| " fmov s1, "REG0" \n" | |||
| " fmov s2, "REG0" \n" | |||
| " fmov s3, "REG0" \n" | |||
| " fmov s4, "REG0" \n" | |||
| " fmov s5, "REG0" \n" | |||
| " fmov s6, "REG0" \n" | |||
| " fmov s7, "REG0" \n" | |||
| " cmp "N", xzr \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #5 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| "2: //asum_kernel_F32: \n" | |||
| " "KERNEL_F32" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 2b //asum_kernel_F32 \n" | |||
| " "KERNEL_F32_FINALIZE" \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #31 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMFD" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| : [N_] "r" (n), //%1 | |||
| [X_] "r" (x), //%2 | |||
| [INCX_] "r" (inc_x) //%3 | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | |||
| /* KERNEL_F32 loads into and adds with v16-v31, so they must be declared as clobbered too */ | |||
| "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" | |||
| ); | |||
| return asum; | |||
| } | |||
| #if defined(SMP) | |||
/*
 * Per-thread worker handed to blas_level1_thread_with_return_value.
 * Reduces this thread's slice (n elements of x with stride inc_x) using
 * casum_compute and stores the partial sum in the slot result points to.
 * The dummy*, y and inc_y parameters are unused. Always returns 0.
 */
| static int casum_thread_function(BLASLONG n, BLASLONG dummy0, | |||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) | |||
| { | |||
| const FLOAT partial = casum_compute(n, x, inc_x); | |||
| result[0] = partial; | |||
| return 0; | |||
| } | |||
| #endif | |||
/*
 * CNAME: public entry point of the kernel.
 *
 *   n      number of elements
 *   x      address of the first element
 *   inc_x  stride between elements
 *
 * Returns the value computed by casum_compute over the whole vector.
 *
 * Without SMP the work is done by a single call to casum_compute. With SMP,
 * inputs with inc_x == 0 or n <= 10000 also stay single-threaded; larger
 * inputs are split across num_cpu_avail(1) threads and the per-thread partial
 * sums are added up here.
 */
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
/*
 * Placeholder alpha for the dispatcher. It is zero-initialised and holds two
 * components so that nothing indeterminate is read through the pointer.
 * NOTE(review): the dispatcher is believed to read a real and an imaginary
 * alpha for BLAS_COMPLEX modes - confirm against the threading driver.
 */
| FLOAT dummy_alpha[2] = {0.0, 0.0}; | |||
| #endif | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = casum_compute(n, x, inc_x); | |||
| } else { | |||
| int mode, i; | |||
/*
 * One slot of 2 * sizeof(double) bytes per thread. Zeroed up front so a slot
 * that no worker writes contributes 0 instead of stack garbage.
 */
| char result[MAX_CPU_NUMBER * sizeof(double) * 2] = {0}; | |||
| FLOAT *ptr; | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, | |||
| x, inc_x, NULL, 0, result, 0, | |||
| ( void *)casum_thread_function, nthreads); | |||
/* Walk the slots with the same byte spacing used when they were sized. */
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| asum = asum + (*ptr); | |||
| ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); | |||
| } | |||
| } | |||
| #else | |||
| asum = casum_compute(n, x, inc_x); | |||
| #endif | |||
| return asum; | |||
| } | |||
| @@ -77,7 +77,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " cmp "N", xzr \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| "1: //nrm2_kernel_F_BEGIN: \n" | |||
| " mov x6, #0x7FF0000000000000 //+Infinity \n" | |||
| @@ -345,7 +345,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #endif | |||
| FLOAT ssq, scale; | |||
| if (n <= 0 || inc_x <= 0) return 0.0; | |||
| if (n <= 0 || inc_x == 0) return 0.0; | |||
| #if defined(SMP) | |||
| if (n <= 10000) | |||
| @@ -229,7 +229,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " cmp "N", xzr \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne 5f //nrm2_kernel_S_BEGIN \n" | |||
| @@ -315,7 +315,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT nrm2 = 0.0; | |||
| double nrm2_double = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return 0.0; | |||
| if (n <= 0 || inc_x == 0) return 0.0; | |||
| #if defined(SMP) | |||
| if (n <= 10000) | |||
| @@ -223,7 +223,7 @@ zscal_begin: | |||
| fcmp DA_I, #0.0 | |||
| beq .Lzscal_kernel_RI_zero | |||
| b .Lzscal_kernel_R_zero | |||
| // b .Lzscal_kernel_R_zero | |||
| .Lzscal_kernel_R_non_zero: | |||
| @@ -0,0 +1,244 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
/*
 * Fixed register assignments used by the kernel macros and the inline
 * assembly in this file. The operands are copied into N, X and INC_X at the
 * top of the asm block; J is the loop counter. REG0 is the zero register,
 * used to clear the accumulators. SUMF is the scalar running sum that is
 * moved into the C result at the end. TMPF is not referenced by the macros
 * or the asm block visible in this file.
 */
| #define N "x0" /* vector length */ | |||
| #define X "x1" /* "X" vector address */ | |||
| #define INC_X "x2" /* "X" stride */ | |||
| #define J "x5" /* loop variable */ | |||
| #define REG0 "xzr" | |||
| #define SUMF "d0" | |||
| #define TMPF "d1" | |||
| /******************************************************************************/ | |||
/*
 * KERNEL_F1: processes one element on the unit-stride path.
 * Loads 16 bytes (two doubles) from X into q1, advances X by 16, adds the two
 * doubles together with faddp and accumulates the result into SUMF.
 * No absolute value is taken: the raw components are summed.
 */
| #define KERNEL_F1 \ | |||
| "ldr q1, ["X"] \n" \ | |||
| "add "X", "X", #16 \n" \ | |||
| "faddp d1, v1.2d \n" \ | |||
| "fadd "SUMF", "SUMF", d1 \n" | |||
/*
 * KERNEL_F16: unrolled unit-stride step covering 16 elements (256 bytes).
 *   - loads q16..q31 from X (each register holds two doubles) and advances X
 *     by 256,
 *   - adds neighbouring registers pairwise (v16+=v17, v18+=v19, ... v30+=v31),
 *   - accumulates the eight pair sums lane-wise into v0..v7,
 *   - issues four PLDL1KEEP prefetches for the 256 bytes starting 1024 bytes
 *     beyond the already-advanced X.
 * The loads, adds and prefetches are interleaved. The lanes of v0..v7 are only
 * combined later, by KERNEL_F16_FINALIZE. This macro overwrites v16..v31.
 */
| #define KERNEL_F16 \ | |||
| "ldr q16, ["X"] \n" \ | |||
| "ldr q17, ["X", #16] \n" \ | |||
| "ldr q18, ["X", #32] \n" \ | |||
| "ldr q19, ["X", #48] \n" \ | |||
| "ldp q20, q21, ["X", #64] \n" \ | |||
| "ldp q22, q23, ["X", #96] \n" \ | |||
| "ldp q24, q25, ["X", #128] \n" \ | |||
| "ldp q26, q27, ["X", #160] \n" \ | |||
| "fadd v16.2d, v16.2d, v17.2d \n" \ | |||
| "fadd v18.2d, v18.2d, v19.2d \n" \ | |||
| "ldp q28, q29, ["X", #192] \n" \ | |||
| "ldp q30, q31, ["X", #224] \n" \ | |||
| "add "X", "X", #256 \n" \ | |||
| "fadd v20.2d, v20.2d, v21.2d \n" \ | |||
| "fadd v22.2d, v22.2d, v23.2d \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024] \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ | |||
| "fadd v24.2d, v24.2d, v25.2d \n" \ | |||
| "fadd v26.2d, v26.2d, v27.2d \n" \ | |||
| "fadd v28.2d, v28.2d, v29.2d \n" \ | |||
| "fadd v30.2d, v30.2d, v31.2d \n" \ | |||
| "fadd v0.2d, v0.2d, v16.2d \n" \ | |||
| "fadd v1.2d, v1.2d, v18.2d \n" \ | |||
| "fadd v2.2d, v2.2d, v20.2d \n" \ | |||
| "fadd v3.2d, v3.2d, v22.2d \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ | |||
| "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ | |||
| "fadd v4.2d, v4.2d, v24.2d \n" \ | |||
| "fadd v5.2d, v5.2d, v26.2d \n" \ | |||
| "fadd v6.2d, v6.2d, v28.2d \n" \ | |||
| "fadd v7.2d, v7.2d, v30.2d \n" | |||
/*
 * KERNEL_F16_FINALIZE: collapses the eight vector accumulators v0..v7 into the
 * scalar SUMF after the unrolled loop. The fadd chain sums v0..v7 lane-wise
 * into v0 (tree reduction) and faddp then adds the two lanes of v0 together.
 */
| #define KERNEL_F16_FINALIZE \ | |||
| "fadd v0.2d, v0.2d, v1.2d \n" \ | |||
| "fadd v2.2d, v2.2d, v3.2d \n" \ | |||
| "fadd v4.2d, v4.2d, v5.2d \n" \ | |||
| "fadd v6.2d, v6.2d, v7.2d \n" \ | |||
| "fadd v0.2d, v0.2d, v2.2d \n" \ | |||
| "fadd v4.2d, v4.2d, v6.2d \n" \ | |||
| "fadd v0.2d, v0.2d, v4.2d \n" \ | |||
| "faddp "SUMF", v0.2d \n" | |||
/*
 * INIT_S: one-time setup for the strided path. Shifts INC_X left by 4
 * (multiplies it by 16) so that KERNEL_S1, which loads 16 bytes per element,
 * can add it straight to the X pointer.
 */
| #define INIT_S \ | |||
| "lsl "INC_X", "INC_X", #4 \n" | |||
/*
 * KERNEL_S1: processes one element on the strided path.
 * Loads 16 bytes (two doubles) from X into q1, advances X by INC_X (already
 * rescaled by INIT_S), adds the two doubles with faddp and accumulates the
 * result into SUMF. No absolute value is taken.
 */
| #define KERNEL_S1 \ | |||
| "ldr q1, ["X"] \n" \ | |||
| "add "X", "X", "INC_X" \n" \ | |||
| "faddp d1, v1.2d \n" \ | |||
| "fadd "SUMF", "SUMF", d1 \n" | |||
/*
 * Prototype of the threaded level-1 dispatcher called from CNAME further down.
 * It is only declared when SMP is defined.
 * NOTE(review): the implementation lives outside this file; the meaning of the
 * individual arguments cannot be confirmed from here.
 */
| #if defined(SMP) | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
/*
 * zasum_compute: single-threaded reduction over n elements of x, implemented
 * in AArch64 inline assembly. Each element occupies 16 bytes (two doubles).
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x      address of the first element
 *   inc_x  stride between elements, in elements; inc_x <= 0 yields 0.0
 *
 * Returns the sum of all loaded double values (the content of SUMF).
 *
 * Control flow inside the asm block:
 *   inc_x == 1 : labels 1-4. 16 elements per iteration via KERNEL_F16
 *                (n >> 4 iterations), folded by KERNEL_F16_FINALIZE, then the
 *                n & 15 leftovers one at a time via KERNEL_F1.
 *   otherwise  : labels 5-8. INIT_S turns the stride into bytes, then
 *                KERNEL_S1 runs four times per iteration (n >> 2 iterations)
 *                followed by the n & 3 leftovers.
 *   label 9    : common exit; SUMF is moved into the C result.
 *
 * NOTE(review): none of the kernel macros takes an absolute value, so despite
 * the "asum" naming this computes a plain sum of the components - confirm that
 * this is the intent.
 */
| static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT asum = 0.0 ; | |||
| if ( n < 0 ) return(asum); | |||
| __asm__ __volatile__ ( | |||
/* Copy the operands into the fixed registers the kernel macros expect. */
| " mov "N", %[N_] \n" | |||
| " mov "X", %[X_] \n" | |||
| " mov "INC_X", %[INCX_] \n" | |||
/* Clear the running sum and the remaining accumulators. */
| " fmov "SUMF", "REG0" \n" | |||
| " fmov d1, "REG0" \n" | |||
| " fmov d2, "REG0" \n" | |||
| " fmov d3, "REG0" \n" | |||
| " fmov d4, "REG0" \n" | |||
| " fmov d5, "REG0" \n" | |||
| " fmov d6, "REG0" \n" | |||
| " fmov d7, "REG0" \n" | |||
/* Nothing to do for n <= 0 or inc_x <= 0: leave with the zeroed sum. */
| " cmp "N", xzr \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
/* Unit-stride path. */
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #4 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| ".align 5 \n" | |||
| "2: //asum_kernel_F16: \n" | |||
| " "KERNEL_F16" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 2b //asum_kernel_F16 \n" | |||
| " "KERNEL_F16_FINALIZE" \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #15 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
/* Strided path. */
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMF" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| : [N_] "r" (n), //%1 | |||
| [X_] "r" (x), //%2 | |||
| [INCX_] "r" (inc_x) //%3 | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
/*
 * KERNEL_F16 loads into q16..q31, so the whole upper vector bank must be
 * declared clobbered too; otherwise the compiler may keep live values in
 * registers the asm overwrites.
 */
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", | |||
| "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" | |||
| ); | |||
| return asum; | |||
| } | |||
| #if defined(SMP) | |||
/*
 * Per-thread worker handed to blas_level1_thread_with_return_value.
 * Reduces this thread's slice (n elements of x with stride inc_x) using
 * zasum_compute and stores the partial sum in the slot result points to.
 * The dummy*, y and inc_y parameters are unused. Always returns 0.
 */
| static int zasum_thread_function(BLASLONG n, BLASLONG dummy0, | |||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) | |||
| { | |||
| const FLOAT partial = zasum_compute(n, x, inc_x); | |||
| result[0] = partial; | |||
| return 0; | |||
| } | |||
| #endif | |||
/*
 * CNAME: public entry point of the kernel.
 *
 *   n      number of elements
 *   x      address of the first element
 *   inc_x  stride between elements
 *
 * Returns the value computed by zasum_compute over the whole vector.
 *
 * Without SMP the work is done by a single call to zasum_compute. With SMP,
 * inputs with inc_x == 0 or n <= 10000 also stay single-threaded; larger
 * inputs are split across num_cpu_avail(1) threads and the per-thread partial
 * sums are added up here.
 */
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
/*
 * Placeholder alpha for the dispatcher. It is zero-initialised and holds two
 * components so that nothing indeterminate is read through the pointer.
 * NOTE(review): the dispatcher is believed to read a real and an imaginary
 * alpha for BLAS_COMPLEX modes - confirm against the threading driver.
 */
| FLOAT dummy_alpha[2] = {0.0, 0.0}; | |||
| #endif | |||
| FLOAT asum = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| asum = zasum_compute(n, x, inc_x); | |||
| } else { | |||
| int mode, i; | |||
/*
 * One slot of 2 * sizeof(double) bytes per thread. Zeroed up front so a slot
 * that no worker writes contributes 0 instead of stack garbage.
 */
| char result[MAX_CPU_NUMBER * sizeof(double) * 2] = {0}; | |||
| FLOAT *ptr; | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, | |||
| x, inc_x, NULL, 0, result, 0, | |||
| ( void *)zasum_thread_function, nthreads); | |||
/* Walk the slots with the same byte spacing used when they were sized. */
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| asum = asum + (*ptr); | |||
| ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); | |||
| } | |||
| } | |||
| #else | |||
| asum = zasum_compute(n, x, inc_x); | |||
| #endif | |||
| return asum; | |||
| } | |||
| @@ -0,0 +1,149 @@ | |||
# Kernel selection table: maps each <precision><routine>KERNEL variable to the
# source file that implements it. Every entry below points at a portable C
# file, either under ../arm or under ../generic; no assembly kernels are used.
# The S/D/C/Z prefixes select the precision variant; real variants share one
# source and complex variants share its z* counterpart.
# NOTE(review): the target this table belongs to is not visible here.

# Vector kernels (max/min, index of max/min, asum/sum, axpy, copy, dot, nrm2,
# rot, scal, swap) and GEMV kernels: ../arm sources, except DSDOT (generic).
| SAMAXKERNEL = ../arm/amax.c | |||
| DAMAXKERNEL = ../arm/amax.c | |||
| CAMAXKERNEL = ../arm/zamax.c | |||
| ZAMAXKERNEL = ../arm/zamax.c | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMAXKERNEL = ../arm/iamax.c | |||
| IDAMAXKERNEL = ../arm/iamax.c | |||
| ICAMAXKERNEL = ../arm/izamax.c | |||
| IZAMAXKERNEL = ../arm/izamax.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| SASUMKERNEL = ../arm/asum.c | |||
| DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| SCOPYKERNEL = ../arm/copy.c | |||
| DCOPYKERNEL = ../arm/copy.c | |||
| CCOPYKERNEL = ../arm/zcopy.c | |||
| ZCOPYKERNEL = ../arm/zcopy.c | |||
| SDOTKERNEL = ../arm/dot.c | |||
| DDOTKERNEL = ../arm/dot.c | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| SROTKERNEL = ../arm/rot.c | |||
| DROTKERNEL = ../arm/rot.c | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| SSCALKERNEL = ../arm/scal.c | |||
| DSCALKERNEL = ../arm/scal.c | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| SSWAPKERNEL = ../arm/swap.c | |||
| DSWAPKERNEL = ../arm/swap.c | |||
| CSWAPKERNEL = ../arm/zswap.c | |||
| ZSWAPKERNEL = ../arm/zswap.c | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
# TRMM and GEMM kernels with their packing (copy) routines: generic 2x2 C
# sources. The *COPYOBJ variables name the object files built from the copy
# routines; TSUFFIX and SUFFIX are expected to be set by the including makefile.
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernels: all four precisions, complex included, use the same generic
# trsm_kernel_{LN,LT,RN,RT}.c sources.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# Miscellaneous helpers (cabs, lsame) and the GEMM beta-scaling routines.
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| @@ -0,0 +1 @@ | |||
# Double-colon "clean" rule with no prerequisites and no recipe: it adds
# nothing to clean in this directory.
| clean :: | |||
| @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, | |||
| if ( rows <= 0 ) return(0); | |||
| if ( cols <= 0 ) return(0); | |||
| if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); | |||
| aptr = a; | |||
| lda *= 2; | |||
| @@ -0,0 +1,587 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| boffset = b; | |||
| lda *= 2; | |||
| #if 0 | |||
| fprintf(stderr, "M = %d N = %d\n", m, n); | |||
| #endif | |||
| j = (n >> 4); | |||
| if (j > 0){ | |||
| do{ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset + lda; | |||
| aoffset += 32; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| ctemp09 = *(aoffset1 + 8); | |||
| ctemp10 = *(aoffset1 + 9); | |||
| ctemp11 = *(aoffset1 + 10); | |||
| ctemp12 = *(aoffset1 + 11); | |||
| ctemp13 = *(aoffset1 + 12); | |||
| ctemp14 = *(aoffset1 + 13); | |||
| ctemp15 = *(aoffset1 + 14); | |||
| ctemp16 = *(aoffset1 + 15); | |||
| ctemp17 = *(aoffset1 + 16); | |||
| ctemp18 = *(aoffset1 + 17); | |||
| ctemp19 = *(aoffset1 + 18); | |||
| ctemp20 = *(aoffset1 + 19); | |||
| ctemp21 = *(aoffset1 + 20); | |||
| ctemp22 = *(aoffset1 + 21); | |||
| ctemp23 = *(aoffset1 + 22); | |||
| ctemp24 = *(aoffset1 + 23); | |||
| ctemp25 = *(aoffset1 + 24); | |||
| ctemp26 = *(aoffset1 + 25); | |||
| ctemp27 = *(aoffset1 + 26); | |||
| ctemp28 = *(aoffset1 + 27); | |||
| ctemp29 = *(aoffset1 + 28); | |||
| ctemp30 = *(aoffset1 + 29); | |||
| ctemp31 = *(aoffset1 + 30); | |||
| ctemp32 = *(aoffset1 + 31); | |||
| ctemp33 = *(aoffset2 + 0); | |||
| ctemp34 = *(aoffset2 + 1); | |||
| ctemp35 = *(aoffset2 + 2); | |||
| ctemp36 = *(aoffset2 + 3); | |||
| ctemp37 = *(aoffset2 + 4); | |||
| ctemp38 = *(aoffset2 + 5); | |||
| ctemp39 = *(aoffset2 + 6); | |||
| ctemp40 = *(aoffset2 + 7); | |||
| ctemp41 = *(aoffset2 + 8); | |||
| ctemp42 = *(aoffset2 + 9); | |||
| ctemp43 = *(aoffset2 + 10); | |||
| ctemp44 = *(aoffset2 + 11); | |||
| ctemp45 = *(aoffset2 + 12); | |||
| ctemp46 = *(aoffset2 + 13); | |||
| ctemp47 = *(aoffset2 + 14); | |||
| ctemp48 = *(aoffset2 + 15); | |||
| ctemp49 = *(aoffset2 + 16); | |||
| ctemp50 = *(aoffset2 + 17); | |||
| ctemp51 = *(aoffset2 + 18); | |||
| ctemp52 = *(aoffset2 + 19); | |||
| ctemp53 = *(aoffset2 + 20); | |||
| ctemp54 = *(aoffset2 + 21); | |||
| ctemp55 = *(aoffset2 + 22); | |||
| ctemp56 = *(aoffset2 + 23); | |||
| ctemp57 = *(aoffset2 + 24); | |||
| ctemp58 = *(aoffset2 + 25); | |||
| ctemp59 = *(aoffset2 + 26); | |||
| ctemp60 = *(aoffset2 + 27); | |||
| ctemp61 = *(aoffset2 + 28); | |||
| ctemp62 = *(aoffset2 + 29); | |||
| ctemp63 = *(aoffset2 + 30); | |||
| ctemp64 = *(aoffset2 + 31); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| *(boffset + 8) = -ctemp09; | |||
| *(boffset + 9) = -ctemp10; | |||
| *(boffset + 10) = -ctemp11; | |||
| *(boffset + 11) = -ctemp12; | |||
| *(boffset + 12) = -ctemp13; | |||
| *(boffset + 13) = -ctemp14; | |||
| *(boffset + 14) = -ctemp15; | |||
| *(boffset + 15) = -ctemp16; | |||
| *(boffset + 16) = -ctemp17; | |||
| *(boffset + 17) = -ctemp18; | |||
| *(boffset + 18) = -ctemp19; | |||
| *(boffset + 19) = -ctemp20; | |||
| *(boffset + 20) = -ctemp21; | |||
| *(boffset + 21) = -ctemp22; | |||
| *(boffset + 22) = -ctemp23; | |||
| *(boffset + 23) = -ctemp24; | |||
| *(boffset + 24) = -ctemp25; | |||
| *(boffset + 25) = -ctemp26; | |||
| *(boffset + 26) = -ctemp27; | |||
| *(boffset + 27) = -ctemp28; | |||
| *(boffset + 28) = -ctemp29; | |||
| *(boffset + 29) = -ctemp30; | |||
| *(boffset + 30) = -ctemp31; | |||
| *(boffset + 31) = -ctemp32; | |||
| *(boffset + 32) = -ctemp33; | |||
| *(boffset + 33) = -ctemp34; | |||
| *(boffset + 34) = -ctemp35; | |||
| *(boffset + 35) = -ctemp36; | |||
| *(boffset + 36) = -ctemp37; | |||
| *(boffset + 37) = -ctemp38; | |||
| *(boffset + 38) = -ctemp39; | |||
| *(boffset + 39) = -ctemp40; | |||
| *(boffset + 40) = -ctemp41; | |||
| *(boffset + 41) = -ctemp42; | |||
| *(boffset + 42) = -ctemp43; | |||
| *(boffset + 43) = -ctemp44; | |||
| *(boffset + 44) = -ctemp45; | |||
| *(boffset + 45) = -ctemp46; | |||
| *(boffset + 46) = -ctemp47; | |||
| *(boffset + 47) = -ctemp48; | |||
| *(boffset + 48) = -ctemp49; | |||
| *(boffset + 49) = -ctemp50; | |||
| *(boffset + 50) = -ctemp51; | |||
| *(boffset + 51) = -ctemp52; | |||
| *(boffset + 52) = -ctemp53; | |||
| *(boffset + 53) = -ctemp54; | |||
| *(boffset + 54) = -ctemp55; | |||
| *(boffset + 55) = -ctemp56; | |||
| *(boffset + 56) = -ctemp57; | |||
| *(boffset + 57) = -ctemp58; | |||
| *(boffset + 58) = -ctemp59; | |||
| *(boffset + 59) = -ctemp60; | |||
| *(boffset + 60) = -ctemp61; | |||
| *(boffset + 61) = -ctemp62; | |||
| *(boffset + 62) = -ctemp63; | |||
| *(boffset + 63) = -ctemp64; | |||
| aoffset1 += 2 * lda; | |||
| aoffset2 += 2 * lda; | |||
| boffset += 64; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| ctemp09 = *(aoffset1 + 8); | |||
| ctemp10 = *(aoffset1 + 9); | |||
| ctemp11 = *(aoffset1 + 10); | |||
| ctemp12 = *(aoffset1 + 11); | |||
| ctemp13 = *(aoffset1 + 12); | |||
| ctemp14 = *(aoffset1 + 13); | |||
| ctemp15 = *(aoffset1 + 14); | |||
| ctemp16 = *(aoffset1 + 15); | |||
| ctemp17 = *(aoffset1 + 16); | |||
| ctemp18 = *(aoffset1 + 17); | |||
| ctemp19 = *(aoffset1 + 18); | |||
| ctemp20 = *(aoffset1 + 19); | |||
| ctemp21 = *(aoffset1 + 20); | |||
| ctemp22 = *(aoffset1 + 21); | |||
| ctemp23 = *(aoffset1 + 22); | |||
| ctemp24 = *(aoffset1 + 23); | |||
| ctemp25 = *(aoffset1 + 24); | |||
| ctemp26 = *(aoffset1 + 25); | |||
| ctemp27 = *(aoffset1 + 26); | |||
| ctemp28 = *(aoffset1 + 27); | |||
| ctemp29 = *(aoffset1 + 28); | |||
| ctemp30 = *(aoffset1 + 29); | |||
| ctemp31 = *(aoffset1 + 30); | |||
| ctemp32 = *(aoffset1 + 31); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| *(boffset + 8) = -ctemp09; | |||
| *(boffset + 9) = -ctemp10; | |||
| *(boffset + 10) = -ctemp11; | |||
| *(boffset + 11) = -ctemp12; | |||
| *(boffset + 12) = -ctemp13; | |||
| *(boffset + 13) = -ctemp14; | |||
| *(boffset + 14) = -ctemp15; | |||
| *(boffset + 15) = -ctemp16; | |||
| *(boffset + 16) = -ctemp17; | |||
| *(boffset + 17) = -ctemp18; | |||
| *(boffset + 18) = -ctemp19; | |||
| *(boffset + 19) = -ctemp20; | |||
| *(boffset + 20) = -ctemp21; | |||
| *(boffset + 21) = -ctemp22; | |||
| *(boffset + 22) = -ctemp23; | |||
| *(boffset + 23) = -ctemp24; | |||
| *(boffset + 24) = -ctemp25; | |||
| *(boffset + 25) = -ctemp26; | |||
| *(boffset + 26) = -ctemp27; | |||
| *(boffset + 27) = -ctemp28; | |||
| *(boffset + 28) = -ctemp29; | |||
| *(boffset + 29) = -ctemp30; | |||
| *(boffset + 30) = -ctemp31; | |||
| *(boffset + 31) = -ctemp32; | |||
| boffset += 32; | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } /* end of if(j > 0) */ | |||
| if (n & 8){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset + lda; | |||
| aoffset += 16; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| ctemp09 = *(aoffset1 + 8); | |||
| ctemp10 = *(aoffset1 + 9); | |||
| ctemp11 = *(aoffset1 + 10); | |||
| ctemp12 = *(aoffset1 + 11); | |||
| ctemp13 = *(aoffset1 + 12); | |||
| ctemp14 = *(aoffset1 + 13); | |||
| ctemp15 = *(aoffset1 + 14); | |||
| ctemp16 = *(aoffset1 + 15); | |||
| ctemp17 = *(aoffset2 + 0); | |||
| ctemp18 = *(aoffset2 + 1); | |||
| ctemp19 = *(aoffset2 + 2); | |||
| ctemp20 = *(aoffset2 + 3); | |||
| ctemp21 = *(aoffset2 + 4); | |||
| ctemp22 = *(aoffset2 + 5); | |||
| ctemp23 = *(aoffset2 + 6); | |||
| ctemp24 = *(aoffset2 + 7); | |||
| ctemp25 = *(aoffset2 + 8); | |||
| ctemp26 = *(aoffset2 + 9); | |||
| ctemp27 = *(aoffset2 + 10); | |||
| ctemp28 = *(aoffset2 + 11); | |||
| ctemp29 = *(aoffset2 + 12); | |||
| ctemp30 = *(aoffset2 + 13); | |||
| ctemp31 = *(aoffset2 + 14); | |||
| ctemp32 = *(aoffset2 + 15); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| *(boffset + 8) = -ctemp09; | |||
| *(boffset + 9) = -ctemp10; | |||
| *(boffset + 10) = -ctemp11; | |||
| *(boffset + 11) = -ctemp12; | |||
| *(boffset + 12) = -ctemp13; | |||
| *(boffset + 13) = -ctemp14; | |||
| *(boffset + 14) = -ctemp15; | |||
| *(boffset + 15) = -ctemp16; | |||
| *(boffset + 16) = -ctemp17; | |||
| *(boffset + 17) = -ctemp18; | |||
| *(boffset + 18) = -ctemp19; | |||
| *(boffset + 19) = -ctemp20; | |||
| *(boffset + 20) = -ctemp21; | |||
| *(boffset + 21) = -ctemp22; | |||
| *(boffset + 22) = -ctemp23; | |||
| *(boffset + 23) = -ctemp24; | |||
| *(boffset + 24) = -ctemp25; | |||
| *(boffset + 25) = -ctemp26; | |||
| *(boffset + 26) = -ctemp27; | |||
| *(boffset + 27) = -ctemp28; | |||
| *(boffset + 28) = -ctemp29; | |||
| *(boffset + 29) = -ctemp30; | |||
| *(boffset + 30) = -ctemp31; | |||
| *(boffset + 31) = -ctemp32; | |||
| aoffset1 += 2 * lda; | |||
| aoffset2 += 2 * lda; | |||
| boffset += 32; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| ctemp09 = *(aoffset1 + 8); | |||
| ctemp10 = *(aoffset1 + 9); | |||
| ctemp11 = *(aoffset1 + 10); | |||
| ctemp12 = *(aoffset1 + 11); | |||
| ctemp13 = *(aoffset1 + 12); | |||
| ctemp14 = *(aoffset1 + 13); | |||
| ctemp15 = *(aoffset1 + 14); | |||
| ctemp16 = *(aoffset1 + 15); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| *(boffset + 8) = -ctemp09; | |||
| *(boffset + 9) = -ctemp10; | |||
| *(boffset + 10) = -ctemp11; | |||
| *(boffset + 11) = -ctemp12; | |||
| *(boffset + 12) = -ctemp13; | |||
| *(boffset + 13) = -ctemp14; | |||
| *(boffset + 14) = -ctemp15; | |||
| *(boffset + 15) = -ctemp16; | |||
| boffset += 16; | |||
| } | |||
| } | |||
| if (n & 4){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset + lda; | |||
| aoffset += 8; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| ctemp09 = *(aoffset2 + 0); | |||
| ctemp10 = *(aoffset2 + 1); | |||
| ctemp11 = *(aoffset2 + 2); | |||
| ctemp12 = *(aoffset2 + 3); | |||
| ctemp13 = *(aoffset2 + 4); | |||
| ctemp14 = *(aoffset2 + 5); | |||
| ctemp15 = *(aoffset2 + 6); | |||
| ctemp16 = *(aoffset2 + 7); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| *(boffset + 8) = -ctemp09; | |||
| *(boffset + 9) = -ctemp10; | |||
| *(boffset + 10) = -ctemp11; | |||
| *(boffset + 11) = -ctemp12; | |||
| *(boffset + 12) = -ctemp13; | |||
| *(boffset + 13) = -ctemp14; | |||
| *(boffset + 14) = -ctemp15; | |||
| *(boffset + 15) = -ctemp16; | |||
| aoffset1 += 2 * lda; | |||
| aoffset2 += 2 * lda; | |||
| boffset += 16; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset1 + 4); | |||
| ctemp06 = *(aoffset1 + 5); | |||
| ctemp07 = *(aoffset1 + 6); | |||
| ctemp08 = *(aoffset1 + 7); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| boffset += 8; | |||
| } | |||
| } | |||
| if (n & 2){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset + lda; | |||
| aoffset += 4; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| ctemp05 = *(aoffset2 + 0); | |||
| ctemp06 = *(aoffset2 + 1); | |||
| ctemp07 = *(aoffset2 + 2); | |||
| ctemp08 = *(aoffset2 + 3); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| *(boffset + 4) = -ctemp05; | |||
| *(boffset + 5) = -ctemp06; | |||
| *(boffset + 6) = -ctemp07; | |||
| *(boffset + 7) = -ctemp08; | |||
| aoffset1 += 2 * lda; | |||
| aoffset2 += 2 * lda; | |||
| boffset += 8; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset1 + 2); | |||
| ctemp04 = *(aoffset1 + 3); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| boffset += 4; | |||
| } | |||
| } | |||
| if (n & 1){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset + lda; | |||
| // aoffset += 2; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| *(boffset + 2) = -ctemp03; | |||
| *(boffset + 3) = -ctemp04; | |||
| aoffset1 += 2 * lda; | |||
| aoffset2 += 2 * lda; | |||
| boffset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| *(boffset + 0) = -ctemp01; | |||
| *(boffset + 1) = -ctemp02; | |||
| // boffset += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,333 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js, offset; | |||
| FLOAT data01, data02, data03, data04, data05, data06, data07, data08; | |||
| FLOAT data09, data10, data11, data12, data13, data14, data15, data16; | |||
| FLOAT data17, data18, data19, data20, data21, data22, data23, data24; | |||
| FLOAT data25, data26, data27, data28, data29, data30, data31, data32; | |||
| FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; | |||
| FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; | |||
| lda *= 2; | |||
| js = (n >> 4); | |||
| while (js > 0){ | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; | |||
| if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; | |||
| if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; | |||
| if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; | |||
| if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; | |||
| if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; | |||
| if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; | |||
| if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; | |||
| if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; | |||
| if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; | |||
| if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; | |||
| if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; | |||
| if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; | |||
| if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; | |||
| if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| data09 = *(ao5 + 0); | |||
| data10 = *(ao5 + 1); | |||
| data11 = *(ao6 + 0); | |||
| data12 = *(ao6 + 1); | |||
| data13 = *(ao7 + 0); | |||
| data14 = *(ao7 + 1); | |||
| data15 = *(ao8 + 0); | |||
| data16 = *(ao8 + 1); | |||
| data17 = *(ao9 + 0); | |||
| data18 = *(ao9 + 1); | |||
| data19 = *(ao10 + 0); | |||
| data20 = *(ao10 + 1); | |||
| data21 = *(ao11 + 0); | |||
| data22 = *(ao11 + 1); | |||
| data23 = *(ao12 + 0); | |||
| data24 = *(ao12 + 1); | |||
| data25 = *(ao13 + 0); | |||
| data26 = *(ao13 + 1); | |||
| data27 = *(ao14 + 0); | |||
| data28 = *(ao14 + 1); | |||
| data29 = *(ao15 + 0); | |||
| data30 = *(ao15 + 1); | |||
| data31 = *(ao16 + 0); | |||
| data32 = *(ao16 + 1); | |||
| if (offset > 0) ao1 += lda; else ao1 += 2; | |||
| if (offset > -1) ao2 += lda; else ao2 += 2; | |||
| if (offset > -2) ao3 += lda; else ao3 += 2; | |||
| if (offset > -3) ao4 += lda; else ao4 += 2; | |||
| if (offset > -4) ao5 += lda; else ao5 += 2; | |||
| if (offset > -5) ao6 += lda; else ao6 += 2; | |||
| if (offset > -6) ao7 += lda; else ao7 += 2; | |||
| if (offset > -7) ao8 += lda; else ao8 += 2; | |||
| if (offset > -8) ao9 += lda; else ao9 += 2; | |||
| if (offset > -9) ao10 += lda; else ao10 += 2; | |||
| if (offset > -10) ao11 += lda; else ao11 += 2; | |||
| if (offset > -11) ao12 += lda; else ao12 += 2; | |||
| if (offset > -12) ao13 += lda; else ao13 += 2; | |||
| if (offset > -13) ao14 += lda; else ao14 += 2; | |||
| if (offset > -14) ao15 += lda; else ao15 += 2; | |||
| if (offset > -15) ao16 += lda; else ao16 += 2; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| b[16] = data17; | |||
| b[17] = data18; | |||
| b[18] = data19; | |||
| b[19] = data20; | |||
| b[20] = data21; | |||
| b[21] = data22; | |||
| b[22] = data23; | |||
| b[23] = data24; | |||
| b[24] = data25; | |||
| b[25] = data26; | |||
| b[26] = data27; | |||
| b[27] = data28; | |||
| b[28] = data29; | |||
| b[29] = data30; | |||
| b[30] = data31; | |||
| b[31] = data32; | |||
| b += 32; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 16; | |||
| js --; | |||
| } | |||
| if (n & 8) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; | |||
| if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; | |||
| if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; | |||
| if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; | |||
| if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; | |||
| if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; | |||
| if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| data09 = *(ao5 + 0); | |||
| data10 = *(ao5 + 1); | |||
| data11 = *(ao6 + 0); | |||
| data12 = *(ao6 + 1); | |||
| data13 = *(ao7 + 0); | |||
| data14 = *(ao7 + 1); | |||
| data15 = *(ao8 + 0); | |||
| data16 = *(ao8 + 1); | |||
| if (offset > 0) ao1 += lda; else ao1 += 2; | |||
| if (offset > -1) ao2 += lda; else ao2 += 2; | |||
| if (offset > -2) ao3 += lda; else ao3 += 2; | |||
| if (offset > -3) ao4 += lda; else ao4 += 2; | |||
| if (offset > -4) ao5 += lda; else ao5 += 2; | |||
| if (offset > -5) ao6 += lda; else ao6 += 2; | |||
| if (offset > -6) ao7 += lda; else ao7 += 2; | |||
| if (offset > -7) ao8 += lda; else ao8 += 2; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b[ 8] = data09; | |||
| b[ 9] = data10; | |||
| b[10] = data11; | |||
| b[11] = data12; | |||
| b[12] = data13; | |||
| b[13] = data14; | |||
| b[14] = data15; | |||
| b[15] = data16; | |||
| b += 16; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 8; | |||
| } | |||
| if (n & 4) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; | |||
| if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; | |||
| if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| data05 = *(ao3 + 0); | |||
| data06 = *(ao3 + 1); | |||
| data07 = *(ao4 + 0); | |||
| data08 = *(ao4 + 1); | |||
| if (offset > 0) ao1 += lda; else ao1 += 2; | |||
| if (offset > -1) ao2 += lda; else ao2 += 2; | |||
| if (offset > -2) ao3 += lda; else ao3 += 2; | |||
| if (offset > -3) ao4 += lda; else ao4 += 2; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b[ 4] = data05; | |||
| b[ 5] = data06; | |||
| b[ 6] = data07; | |||
| b[ 7] = data08; | |||
| b += 8; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 4; | |||
| } | |||
| if (n & 2) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; | |||
| if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| data03 = *(ao2 + 0); | |||
| data04 = *(ao2 + 1); | |||
| if (offset > 0) ao1 += lda; else ao1 += 2; | |||
| if (offset > -1) ao2 += lda; else ao2 += 2; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| b += 4; | |||
| offset --; | |||
| i --; | |||
| } | |||
| posX += 2; | |||
| } | |||
| if (n & 1) { | |||
| offset = posX - posY; | |||
| if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; | |||
| i = m; | |||
| while (i > 0) { | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| if (offset > 0) ao1 += lda; else ao1 += 2; | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| b += 2; | |||
| offset --; | |||
| i --; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||