Update from develop for 0.3.30 release (tags/v0.3.30)
| @@ -58,8 +58,8 @@ task: | |||||
| - export VALID_ARCHS="i386 x86_64" | - export VALID_ARCHS="i386 x86_64" | ||||
| - xcrun --sdk macosx --show-sdk-path | - xcrun --sdk macosx --show-sdk-path | ||||
| - xcodebuild -version | - xcodebuild -version | ||||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" | |||||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" | |||||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | ||||
| always: | always: | ||||
| config_artifacts: | config_artifacts: | ||||
| @@ -78,8 +78,8 @@ task: | |||||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | ||||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | ||||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | ||||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| - xcrun --sdk iphoneos --show-sdk-path | - xcrun --sdk iphoneos --show-sdk-path | ||||
| - ls -l /Applications | - ls -l /Applications | ||||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | ||||
| @@ -127,7 +127,7 @@ task: | |||||
| FreeBSD_task: | FreeBSD_task: | ||||
| name: FreeBSD-gcc | name: FreeBSD-gcc | ||||
| freebsd_instance: | freebsd_instance: | ||||
| image_family: freebsd-14-1 | |||||
| image_family: freebsd-14-2 | |||||
| install_script: | install_script: | ||||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | ||||
| compile_script: | compile_script: | ||||
| @@ -138,7 +138,7 @@ FreeBSD_task: | |||||
| FreeBSD_task: | FreeBSD_task: | ||||
| name: freebsd-gcc-ilp64 | name: freebsd-gcc-ilp64 | ||||
| freebsd_instance: | freebsd_instance: | ||||
| image_family: freebsd-14-1 | |||||
| image_family: freebsd-14-2 | |||||
| install_script: | install_script: | ||||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | ||||
| compile_script: | compile_script: | ||||
| @@ -148,7 +148,7 @@ FreeBSD_task: | |||||
| FreeBSD_task: | FreeBSD_task: | ||||
| name: FreeBSD-clang-openmp | name: FreeBSD-clang-openmp | ||||
| freebsd_instance: | freebsd_instance: | ||||
| image_family: freebsd-14-1 | |||||
| image_family: freebsd-14-2 | |||||
| install_script: | install_script: | ||||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | ||||
| - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so | - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so | ||||
| @@ -102,6 +102,7 @@ jobs: | |||||
| mkdir build && cd build | mkdir build && cd build | ||||
| cmake -DDYNAMIC_ARCH=1 \ | cmake -DDYNAMIC_ARCH=1 \ | ||||
| -DUSE_OPENMP=${{matrix.openmp}} \ | -DUSE_OPENMP=${{matrix.openmp}} \ | ||||
| -DOpenMP_Fortran_LIB_NAMES=omp \ | |||||
| -DINTERFACE64=${{matrix.ilp64}} \ | -DINTERFACE64=${{matrix.ilp64}} \ | ||||
| -DNOFORTRAN=0 \ | -DNOFORTRAN=0 \ | ||||
| -DBUILD_WITHOUT_LAPACK=0 \ | -DBUILD_WITHOUT_LAPACK=0 \ | ||||
| @@ -31,27 +31,28 @@ jobs: | |||||
| steps: | steps: | ||||
| - name: Checkout repository | - name: Checkout repository | ||||
| uses: actions/checkout@v3 | |||||
| uses: actions/checkout@v4 | |||||
| - name: install build deps | - name: install build deps | ||||
| run: | | run: | | ||||
| sudo apt-get update | sudo apt-get update | ||||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | ||||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross | |||||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||||
| - name: checkout qemu | - name: checkout qemu | ||||
| uses: actions/checkout@v3 | |||||
| uses: actions/checkout@v4 | |||||
| with: | with: | ||||
| repository: T-head-Semi/qemu | |||||
| repository: XUANTIE-RV/qemu | |||||
| path: qemu | path: qemu | ||||
| ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||||
| ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 | |||||
| - name: build qemu | - name: build qemu | ||||
| run: | | run: | | ||||
| # Force use c910v qemu-user | # Force use c910v qemu-user | ||||
| wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||||
| wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||||
| cd qemu | cd qemu | ||||
| patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||||
| patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||||
| export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | ||||
| make -j$(nproc) | make -j$(nproc) | ||||
| make install | make install | ||||
| @@ -82,9 +83,39 @@ jobs: | |||||
| - name: test | - name: test | ||||
| run: | | run: | | ||||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||||
| qemu-riscv64 ./utest/openblas_utest | |||||
| qemu-riscv64 ./utest/openblas_utest_ext | |||||
| # run_with_retry CMD: run the command string CMD under `timeout`, retrying only when it timed out. | |||||
| # CMD is expanded unquoted below, so it is word-split into program and arguments. | |||||
| # Succeeds (return 0) on the first successful attempt; any non-timeout failure, or running | |||||
| # out of retries, terminates the whole script via `exit` with the failing status. | |||||
| run_with_retry() { | |||||
| local cmd="$1" | |||||
| # Initial time limit passed to `timeout`; raised by 5 after every timed-out attempt. | |||||
| local time_out=10 | |||||
| local retries=10 | |||||
| # NOTE(review): `attempt` is assigned but never read in this function. | |||||
| local attempt=0 | |||||
| for ((i=1; i<=retries; i++)); do | |||||
| attempt=$((i)) | |||||
| # On expiry `timeout` sends signal 12; with --preserve-status the exit status is the | |||||
| # command's own, i.e. 128+12 = 140 if the command is killed by that signal. | |||||
| # NOTE(review): signal 12 is presumably SIGUSR2 on the Linux runners - confirm. | |||||
| if timeout -s 12 --preserve-status $time_out $cmd; then | |||||
| echo "Command succeeded on attempt $i." | |||||
| return 0 | |||||
| else | |||||
| # $? here is still the status of the `timeout` command tested by the `if`. | |||||
| local exit_code=$? | |||||
| # 140 is treated as "timed out": retry with a longer limit. | |||||
| if [ $exit_code -eq 140 ]; then | |||||
| echo "Attempt $i timed out (retrying...)" | |||||
| time_out=$((time_out + 5)) | |||||
| else | |||||
| # Any other non-zero status is a genuine failure: stop immediately. | |||||
| echo "Attempt $i failed with exit code $exit_code. Aborting workflow." | |||||
| exit $exit_code | |||||
| fi | |||||
| fi | |||||
| done | |||||
| # Reached only when every attempt timed out; exit_code still holds the last status (140). | |||||
| echo "All $retries attempts failed, giving up." | |||||
| echo "Final failure was due to timeout." | |||||
| echo "Aborting workflow." | |||||
| exit $exit_code | |||||
| } | |||||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH | |||||
| which qemu-riscv64 | |||||
| export QEMU_BIN=$(which qemu-riscv64) | |||||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest" | |||||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" | |||||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | ||||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | ||||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | ||||
| @@ -15,7 +15,7 @@ jobs: | |||||
| strategy: | strategy: | ||||
| fail-fast: false | fail-fast: false | ||||
| matrix: | matrix: | ||||
| os: [ubuntu-latest] | |||||
| os: [ubuntu-22.04] | |||||
| fortran: [gfortran] | fortran: [gfortran] | ||||
| build: [make] | build: [make] | ||||
| pyver: ["3.12"] | pyver: ["3.12"] | ||||
| @@ -147,7 +147,7 @@ jobs: | |||||
| OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' | OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' | ||||
| - name: Run benchmarks | - name: Run benchmarks | ||||
| uses: CodSpeedHQ/action@v2 | |||||
| uses: CodSpeedHQ/action@v3 | |||||
| with: | with: | ||||
| token: ${{ secrets.CODSPEED_TOKEN }} | token: ${{ secrets.CODSPEED_TOKEN }} | ||||
| run: | | run: | | ||||
| @@ -43,7 +43,9 @@ jobs: | |||||
| run: | | run: | | ||||
| if [ "$RUNNER_OS" == "Linux" ]; then | if [ "$RUNNER_OS" == "Linux" ]; then | ||||
| sudo apt-get update | sudo apt-get update | ||||
| sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||||
| sudo apt-get install -y gfortran cmake ccache | |||||
| wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||||
| sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||||
| elif [ "$RUNNER_OS" == "macOS" ]; then | elif [ "$RUNNER_OS" == "macOS" ]; then | ||||
| # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | ||||
| brew reinstall gcc | brew reinstall gcc | ||||
| @@ -354,3 +356,23 @@ jobs: | |||||
| - name: Build OpenBLAS | - name: Build OpenBLAS | ||||
| run: | | run: | | ||||
| make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} | make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} | ||||
| # Native build plus LAPACK test run on an ubuntu-24.04-arm runner; the `if:` restricts it to the OpenMathLib/OpenBLAS repository. | |||||
| neoverse_build: | |||||
| if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||||
| runs-on: ubuntu-24.04-arm | |||||
| steps: | |||||
| - name: Checkout repository | |||||
| uses: actions/checkout@v3 | |||||
| - name: Install Dependencies | |||||
| run: | | |||||
| sudo apt-get update | |||||
| sudo apt-get install -y gcc gfortran make | |||||
| - name: Build OpenBLAS | |||||
| run: | | |||||
| # Use command substitution $(nproc): ${nproc} is an unset shell variable and would leave a bare "make -j" (unlimited parallel jobs). | |||||
| make -j$(nproc) | |||||
| make -j$(nproc) lapack-test | |||||
| @@ -41,7 +41,7 @@ jobs: | |||||
| - name: Install APT deps | - name: Install APT deps | ||||
| run: | | run: | | ||||
| sudo apt-get update | sudo apt-get update | ||||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache | |||||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev | |||||
| - name: Download and install loongarch64-toolchain | - name: Download and install loongarch64-toolchain | ||||
| run: | | run: | | ||||
| @@ -41,14 +41,14 @@ jobs: | |||||
| run: | | run: | | ||||
| sudo apt-get update | sudo apt-get update | ||||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | ||||
| gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross | |||||
| gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev | |||||
| - name: checkout qemu | - name: checkout qemu | ||||
| uses: actions/checkout@v3 | uses: actions/checkout@v3 | ||||
| with: | with: | ||||
| repository: qemu/qemu | repository: qemu/qemu | ||||
| path: qemu | path: qemu | ||||
| ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 | |||||
| ref: ae35f033b874c627d81d51070187fbf55f0bf1a7 | |||||
| - name: build qemu | - name: build qemu | ||||
| run: | | run: | | ||||
| @@ -9,7 +9,7 @@ project(OpenBLAS C ASM) | |||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 29) | |||||
| set(OpenBLAS_PATCH_VERSION 29.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| @@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers) | |||||
| ####### | ####### | ||||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | ||||
| option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF) | |||||
| option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | ||||
| set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | ||||
| @@ -60,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th | |||||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | ||||
| option(BUILD_STATIC_LIBS "Build static library" OFF) | option(BUILD_STATIC_LIBS "Build static library" OFF) | ||||
| option(BUILD_SHARED_LIBS "Build shared library" OFF) | |||||
| if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | ||||
| set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | ||||
| endif() | endif() | ||||
| @@ -75,12 +78,27 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in | |||||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | ||||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) | |||||
| set (DELETE_STATIC_LIBS "") | |||||
| if (NOT BUILD_STATIC_LIBS) | |||||
| message (STATUS "forcing build of a temporary static library for symbol renaming") | |||||
| set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) | |||||
| set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||||
| set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib)) | |||||
| endif () | |||||
| endif() | |||||
| ####### | ####### | ||||
| if(BUILD_WITHOUT_LAPACK) | if(BUILD_WITHOUT_LAPACK) | ||||
| set(NO_LAPACK 1) | set(NO_LAPACK 1) | ||||
| set(NO_LAPACKE 1) | set(NO_LAPACKE 1) | ||||
| endif() | endif() | ||||
| if (BUILD_WITHOUT_LAPACKE) | |||||
| set(NO_LAPACKE 1) | |||||
| endif() | |||||
| if(BUILD_WITHOUT_CBLAS) | if(BUILD_WITHOUT_CBLAS) | ||||
| set(NO_CBLAS 1) | set(NO_CBLAS 1) | ||||
| endif() | endif() | ||||
| @@ -103,14 +121,15 @@ endif() | |||||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | ||||
| if (USE_OPENMP) | |||||
| find_package(OpenMP REQUIRED) | |||||
| endif () | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | ||||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | ||||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||||
| string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) | |||||
| if (${HAVE64} GREATER -1) | |||||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) | |||||
| else () | |||||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||||
| endif () | |||||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | set(BLASDIRS interface driver/level2 driver/level3 driver/others) | ||||
| @@ -224,6 +243,12 @@ endif () | |||||
| # add objects to the openblas lib | # add objects to the openblas lib | ||||
| if(NOT NO_LAPACK) | if(NOT NO_LAPACK) | ||||
| add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | ||||
| if (USE_OPENMP AND (NOT NOFORTRAN)) | |||||
| # Disable OpenMP for LAPACK Fortran codes on Windows. | |||||
| if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||||
| target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) | |||||
| endif() | |||||
| endif() | |||||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | ||||
| endif() | endif() | ||||
| if(NOT NO_LAPACKE) | if(NOT NO_LAPACKE) | ||||
| @@ -265,30 +290,59 @@ endif() | |||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| if(BUILD_STATIC_LIBS) | if(BUILD_STATIC_LIBS) | ||||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||||
| if(NOFORTRAN) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||||
| else() | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||||
| endif() | |||||
| endif() | endif() | ||||
| if(BUILD_SHARED_LIBS) | if(BUILD_SHARED_LIBS) | ||||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||||
| if(NOFORTRAN) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||||
| else() | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||||
| endif() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| # Seems that this hack doesn't required since macOS 11 Big Sur | |||||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||||
| if (NOT NOFORTRAN) | |||||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||||
| else () | |||||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||||
| endif () | |||||
| # Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on | |||||
| if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) | |||||
| # Use response files | |||||
| set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||||
| # Always build static library first | |||||
| if(BUILD_STATIC_LIBS) | |||||
| set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") | |||||
| else() | |||||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||||
| set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") | |||||
| endif() | |||||
| set(CREATE_STATIC_LIBRARY_COMMAND | |||||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " | |||||
| "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") | |||||
| if(BUILD_SHARED_LIBS) | |||||
| add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) | |||||
| set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") | |||||
| endif() | |||||
| if(USE_OPENMP) | |||||
| get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||||
| else() | |||||
| set(OMP_LIB "") | |||||
| endif() | |||||
| if(NOT NOFORTRAN) | |||||
| set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||||
| set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||||
| if(BUILD_SHARED_LIBS) | |||||
| set(CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") | |||||
| endif() | |||||
| else() | |||||
| set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||||
| if(BUILD_SHARED_LIBS) | |||||
| set(CMAKE_C_CREATE_SHARED_LIBRARY | |||||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") | |||||
| endif() | |||||
| endif() | |||||
| endif() | endif() | ||||
| # Handle MSVC exports | # Handle MSVC exports | ||||
| @@ -373,7 +427,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||||
| if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||||
| if (NOT DEFINED ARCH) | if (NOT DEFINED ARCH) | ||||
| set(ARCH_IN "x86_64") | set(ARCH_IN "x86_64") | ||||
| else() | else() | ||||
| @@ -461,10 +515,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||||
| else () | else () | ||||
| set (BZ 0) | set (BZ 0) | ||||
| endif() | endif() | ||||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||||
| set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||||
| set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||||
| if (CMAKE_BUILD_TYPE MATCHES "Debug") | |||||
| set (CRTLIB msvcrtd) | |||||
| set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb) | |||||
| set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||||
| else () | |||||
| set (CRTLIB msvcrt) | |||||
| set (PDBOPT "") | |||||
| endif() | |||||
| #if (USE_PERL) | |||||
| message(STATUS "adding postbuild instruction to rename syms") | |||||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD | |||||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def | |||||
| COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c | |||||
| COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} | |||||
| #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a | |||||
| ${REMOVE_STATIC_LIB} VERBATIM | |||||
| ) | |||||
| #endif () | |||||
| else () | |||||
| if (NOT USE_PERL) | if (NOT USE_PERL) | ||||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | ||||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||||
| COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so | |||||
| COMMENT "renaming symbols" | COMMENT "renaming symbols" | ||||
| ) | ) | ||||
| else() | else() | ||||
| @@ -475,6 +552,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||||
| ) | ) | ||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| endif() | |||||
| if (BUILD_BENCHMARKS) | if (BUILD_BENCHMARKS) | ||||
| #find_package(OpenMP REQUIRED) | #find_package(OpenMP REQUIRED) | ||||
| @@ -645,3 +723,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||||
| install(EXPORT "${PN}${SUFFIX64}Targets" | install(EXPORT "${PN}${SUFFIX64}Targets" | ||||
| NAMESPACE "${PN}${SUFFIX64}::" | NAMESPACE "${PN}${SUFFIX64}::" | ||||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | ||||
| @@ -26,6 +26,9 @@ | |||||
| * Chris Sidebottom <chris.sidebottom@arm.com> | * Chris Sidebottom <chris.sidebottom@arm.com> | ||||
| * Optimizations and other improvements targeting AArch64 | * Optimizations and other improvements targeting AArch64 | ||||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||||
| * Optimizations and other improvements targeting AArch64 | |||||
| ## Previous Developers | ## Previous Developers | ||||
| * Zaheer Chothia <zaheer.chothia@gmail.com> | * Zaheer Chothia <zaheer.chothia@gmail.com> | ||||
| @@ -231,4 +234,23 @@ In chronological order: | |||||
| * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | ||||
| * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | ||||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||||
| * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 | |||||
| * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||||
| * [2025-02-26] Add sbgemv_t_bfdot kernel | |||||
| * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 | |||||
| * [2025-03-12] Optimize aarch64 sgemm_ncopy | |||||
| * Marek Michalowski <marek.michalowski@arm.com> | |||||
| * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` | |||||
| * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` | |||||
| * [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2` | |||||
| * Ye Tao <ye.tao@arm.com> | |||||
| * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 | |||||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||||
| * Abhishek Kumar <https://github.com/abhishek-iitmadras> | |||||
| * [2025-04-22] Optimise dot kernel for NEOVERSE V1 | |||||
| @@ -1,4 +1,138 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.30 | |||||
| 19-Jun-2025 | |||||
| general: | |||||
| - fixed an installation problem with the thread safety test in gmake builds | |||||
| - fixed spurious overwriting of an input array in complex GEMMT/GEMMTR | |||||
| - fixed naming of GEMMTR in error messages from XERBLA | |||||
| - fixed compilation of SBGEMMT/SBGEMMTR in CMake builds | |||||
| - fixed the implementation of ?NRM2 to handle INCX=0 correctly | |||||
| - removed tests for CSROT and ZDROT that relied on unspecified behavior | |||||
| - fixed a performance regression in multithreaded GEMM that was particularly | |||||
| serious on POWER targets | |||||
| - fixed linking issues when using LLVM's flang-new with gmake | |||||
| - fixed a potential thread safety problem with C11 atomic operations | |||||
| - further improved the workload partitioning in parallel GEMM | |||||
| - fixed omission of LAPACKE interfaces for CGESVDQ,CTRSYL3 and ?GEQPF in | |||||
| CMake builds | |||||
| - fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies | |||||
| for LAPACK function SPMV in CMake builds | |||||
| - added explicit CMake options for building LAPACKE and shared libraries | |||||
| - simplified and improved handling of OpenMP options in CMake builds | |||||
| - reworked Windows DLL generation in CMake builds to ensure correct symbol | |||||
| renaming (pre/postfixing) and optional generation of PDB files for debugging | |||||
| - updated the Perl script version of the gensymbol utility for use with | |||||
| Windows-on-Arm | |||||
| - fixed building with (Mingw) gmake on Windows to ensure completeness of the | |||||
| LAPACK included in the static library (potential race condition due to the | |||||
| Windows version of the "ln" utility creating snapshot copies rather than links) | |||||
| - fixed unwanted deletion of the lapacke_mangling.h file by "make clean" | |||||
| - fixed potential duplication of a _64 suffix on library names in CMake builds | |||||
| - fixed compilation of the C fallback copies of the LAPACK code with GCC 15 | |||||
| - included fixes from the Reference-LAPACK project: | |||||
| - fixed a truncated error message in the EIG part of the testsuite | |||||
| (Reference-LAPACK PR 1119) | |||||
| - fixed too strict check in LAPACKE_?gesdd_work (PR #1126) | |||||
| - fixed memory corruption when calling ?GEEV with non-finite data (PR #1128) | |||||
| - fixed missing initialization of a variable in C/GEQP3RK (PR #1131) | |||||
| - fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135) | |||||
| x86_64: | |||||
| - fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - improved the compiler identification code for flang-new | |||||
| - fixed a potential build issue in the ZSUM kernel | |||||
| - fixed "argument list too long" errors when building on MacOS | |||||
| - added cpu autodetection support for several new Arrow Lake models | |||||
| - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH | |||||
| - fixed compilation with the MinGW build of GCC 15 | |||||
| arm64: | |||||
| - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29) | |||||
| - added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds | |||||
| - added an optimized SBGEMM kernel for NEOVERSEV1 | |||||
| - improved 1xN SBGEMM performance by forwarding to SBGEMV | |||||
| - introduced a stepwise increase of the thread count used for | |||||
| SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size | |||||
| - introduced a stepwise increase of the thread count used for | |||||
| DGEMV on NEOVERSEV1 in relation to problem size | |||||
| - introduced a stepwise increase of the thread count used for | |||||
| SDOT and DDOT on NEOVERSEV1 in relation to problem size | |||||
| - worked around assembler limitations in LLVM for Windows-on-Arm | |||||
| - enabled cpu type autodetection from the registry on Windows-on-Arm | |||||
| - improved multithreading threshold for GEMV and GESV on Windows-on-Arm | |||||
| - fixed overoptimization issues with LLVM's flang in Windows-on-Arm | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - added a fast path SGEMM kernel for small workloads on SME capable targets | |||||
| - improved performance of SGEMM and DGEMM kernels for small workloads | |||||
| - improved performance of SGEMV and DGEMV on SVE-capable targets | |||||
| - improved performance of SGEMV on NEOVERSEN1 and Apple M | |||||
| - added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all | |||||
| SVE capable targets | |||||
| - added optimized SBGEMV kernels for NEOVERSEV1/V2/N2 | |||||
| - improved performance of SGEMM through faster NCOPY kernels | |||||
| - added compiler options for the NVIDIA HPC Compiler Suite | |||||
| - fixed compilation on OSX with XCode 16.3 and later | |||||
| - fixed cpu core type and cache size detection on Apple M4 | |||||
| - updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake | |||||
| - fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds | |||||
| - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH | |||||
| - fixed potential miscompilation of the non-SVE SDOT kernel | |||||
| riscv64: | |||||
| - added optimized SROTM and DROTM kernels for x280 | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - improved performance of GEMM_TCOPY on RVV1.0 targets with | |||||
| VLEN of 128 or 256 | |||||
| - improved performance of OMATCOPY on targets with VLEN 256 | |||||
| - greatly improved performance of SGEMV/DGEMV | |||||
| - improved performance of CGEMV and ZGEMV on C910V and all RVV targets | |||||
| with VLEN 256 | |||||
| - improved performance of SAXPBY and DAXPBY on C910V and all RVV targets | |||||
| with VLEN 256 | |||||
| - improved performance of AXPY and DOT on C910V and ZVL256B targets by | |||||
| falling back to non-vectorized code for very small N. (Thereby fixing | |||||
| poor performance of CHBMV/ZHBMV for very small K) | |||||
| - fixed CMake build failures of the TRMM kernels | |||||
| loongarch64: | |||||
| - improved performance of the LSX versions of SSYMV/DSYMV | |||||
| - made the LASX versions of the DSYMV and SSYMV kernels | |||||
| compatible with hardware changes in LA664 and future targets | |||||
| - fixed inaccuracies in several LASX kernels | |||||
| - improved compatibility of LSX kernels with LA264 targets | |||||
| - fixed handling of deprecated target names in CMake builds | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| power: | |||||
| - fixed building for PPCG4 with CMake | |||||
| - fixed SSCAL/DSCAL on PPC970 running FreeBSD | |||||
| - fixed a potential alignment issue in the POWER8 SGEMV kernel | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| zarch: | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - fixed unwanted generation of object files with a writable stack | |||||
| x86: | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - worked around potential miscompilation of CDOT with very old binutils | |||||
| arm: | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| - fixed unwanted generation of object files with a writable stack | |||||
| sparc: | |||||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||||
| alpha: | |||||
| - fixed build failure caused by spurious Windows-only typecasts | |||||
| cell: | |||||
| - fixed probable build issue caused by spurious Windows-only typecasts | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.3.29 | Version 0.3.29 | ||||
| 12-Jan-2025 | 12-Jan-2025 | ||||
| @@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | ||||
| fi | fi | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | |||||
| @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||||
| endif | |||||
| ifneq ($(OSNAME), AIX) | ifneq ($(OSNAME), AIX) | ||||
| @echo -n " Library Name ... $(LIBNAME)" | @echo -n " Library Name ... $(LIBNAME)" | ||||
| else | else | ||||
| @@ -447,7 +452,7 @@ endif | |||||
| @rm -f cblas.tmp cblas.tmp2 | @rm -f cblas.tmp cblas.tmp2 | ||||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | @touch $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | ||||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc | |||||
| @$(MAKE) -C relapack clean | @$(MAKE) -C relapack clean | ||||
| @rm -f *.grd Makefile.conf_last config_last.h | @rm -f *.grd Makefile.conf_last config_last.h | ||||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | ||||
| @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(CORE), ARMV9SME) | |||||
| CCOMMON_OPT += -march=armv9-a+sve2+sme | |||||
| FCOMMON_OPT += -march=armv9-a+sve2 | |||||
| endif | |||||
| ifeq ($(CORE), CORTEXA53) | ifeq ($(CORE), CORTEXA53) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | ||||
| ifneq ($(F_COMPILER), NAG) | ifneq ($(F_COMPILER), NAG) | ||||
| @@ -101,7 +106,7 @@ ifeq ($(CORE), NEOVERSEV1) | |||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | ||||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | ||||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||||
| CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||||
| ifeq (1, $(ISCLANG)) | ifeq (1, $(ISCLANG)) | ||||
| CCOMMON_OPT += -mtune=cortex-x1 | CCOMMON_OPT += -mtune=cortex-x1 | ||||
| else | else | ||||
| @@ -111,7 +116,7 @@ ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | ||||
| endif | endif | ||||
| else | else | ||||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||||
| CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||||
| ifneq ($(CROSS), 1) | ifneq ($(CROSS), 1) | ||||
| CCOMMON_OPT += -mtune=native | CCOMMON_OPT += -mtune=native | ||||
| endif | endif | ||||
| @@ -315,8 +315,8 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | ||||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.29 | |||||
| VERSION = 0.3.29.dev | |||||
| # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | ||||
| # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | ||||
| @@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| GEMM_GEMV_FORWARD = 1 | GEMM_GEMV_FORWARD = 1 | ||||
| GEMM_GEMV_FORWARD_BF16 = 1 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), riscv) | ifeq ($(ARCH), riscv) | ||||
| GEMM_GEMV_FORWARD = 1 | GEMM_GEMV_FORWARD = 1 | ||||
| @@ -420,6 +421,7 @@ ifeq ($(ARCH), arm64) | |||||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | export MACOSX_DEPLOYMENT_TARGET=11.0 | ||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| export NO_SVE = 1 | export NO_SVE = 1 | ||||
| export NO_SME = 1 | |||||
| endif | endif | ||||
| else | else | ||||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | export MACOSX_DEPLOYMENT_TARGET=10.8 | ||||
| @@ -434,6 +436,11 @@ ifeq (x$(XCVER), x 15) | |||||
| CCOMMON_OPT += -Wl,-ld_classic | CCOMMON_OPT += -Wl,-ld_classic | ||||
| FCOMMON_OPT += -Wl,-ld_classic | FCOMMON_OPT += -Wl,-ld_classic | ||||
| endif | endif | ||||
| ifeq (x$(XCVER), x 16) | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | |||||
| override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | ||||
| @@ -709,6 +716,9 @@ DYNAMIC_CORE += NEOVERSEN2 | |||||
| DYNAMIC_CORE += ARMV8SVE | DYNAMIC_CORE += ARMV8SVE | ||||
| DYNAMIC_CORE += A64FX | DYNAMIC_CORE += A64FX | ||||
| endif | endif | ||||
| ifneq ($(NO_SME), 1) | |||||
| DYNAMIC_CORE += ARMV9SME | |||||
| endif | |||||
| DYNAMIC_CORE += THUNDERX | DYNAMIC_CORE += THUNDERX | ||||
| DYNAMIC_CORE += THUNDERX2T99 | DYNAMIC_CORE += THUNDERX2T99 | ||||
| DYNAMIC_CORE += TSV110 | DYNAMIC_CORE += TSV110 | ||||
| @@ -1472,6 +1482,10 @@ ifeq ($(NO_SVE), 1) | |||||
| CCOMMON_OPT += -DNO_SVE | CCOMMON_OPT += -DNO_SVE | ||||
| endif | endif | ||||
| ifeq ($(NO_SME), 1) | |||||
| CCOMMON_OPT += -DNO_SME | |||||
| endif | |||||
| ifdef SMP | ifdef SMP | ||||
| CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
| @@ -111,6 +111,7 @@ THUNDERX3T110 | |||||
| VORTEX | VORTEX | ||||
| A64FX | A64FX | ||||
| ARMV8SVE | ARMV8SVE | ||||
| ARMV9SME | |||||
| FT2000 | FT2000 | ||||
| 9.System Z: | 9.System Z: | ||||
| @@ -25,14 +25,28 @@ jobs: | |||||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | echo "FROM quay.io/pypa/manylinux1_x86_64 | ||||
| COPY . /tmp/openblas | COPY . /tmp/openblas | ||||
| RUN cd /tmp/openblas && \ | RUN cd /tmp/openblas && \ | ||||
| COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ | |||||
| BTYPE='BINARY=64' CC=gcc && \ | |||||
| make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ | |||||
| make -C test $COMMON_FLAGS $BTYPE && \ | |||||
| make -C ctest $COMMON_FLAGS $BTYPE && \ | |||||
| make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile | |||||
| CC=gcc && \ | |||||
| make QUIET_MAKE=1 BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C test BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C ctest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C utest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile | |||||
| docker build . | docker build . | ||||
| displayName: Run manylinux1 docker build | displayName: Run manylinux1 docker build | ||||
| - job: manylinux_32bit | |||||
| pool: | |||||
| vmImage: 'ubuntu-latest' | |||||
| steps: | |||||
| - script: | | |||||
| echo "FROM quay.io/pypa/manylinux2014_i686 | |||||
| COPY . /tmp/openblas | |||||
| RUN cd /tmp/openblas && \ | |||||
| CC=gcc && \ | |||||
| make QUIET_MAKE=1 BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C test BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C ctest BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||||
| make -C utest BINARY=32 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile | |||||
| docker build . | |||||
| displayName: Run manylinux 32bit docker build | |||||
| - job: Intel_SDE_skx | - job: Intel_SDE_skx | ||||
| pool: | pool: | ||||
| vmImage: 'ubuntu-latest' | vmImage: 'ubuntu-latest' | ||||
| @@ -141,7 +155,7 @@ jobs: | |||||
| - job: OSX_OpenMP | - job: OSX_OpenMP | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| brew update | brew update | ||||
| @@ -151,7 +165,7 @@ jobs: | |||||
| - job: OSX_GCC_Nothreads | - job: OSX_GCC_Nothreads | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| brew update | brew update | ||||
| @@ -164,7 +178,19 @@ jobs: | |||||
| - script: | | - script: | | ||||
| brew update | brew update | ||||
| make CC=gcc-12 FC=gfortran-12 | make CC=gcc-12 FC=gfortran-12 | ||||
| - job: OSX_LLVM_flangnew | |||||
| pool: | |||||
| vmImage: 'macOS-latest' | |||||
| variables: | |||||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| steps: | |||||
| - script: | | |||||
| brew update | |||||
| brew install llvm flang | |||||
| make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 | |||||
| - job: OSX_OpenMP_Clang | - job: OSX_OpenMP_Clang | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-latest' | vmImage: 'macOS-latest' | ||||
| @@ -195,7 +221,7 @@ jobs: | |||||
| - job: OSX_dynarch_cmake | - job: OSX_dynarch_cmake | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| variables: | variables: | ||||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | ||||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | LIBRARY_PATH: /usr/local/opt/llvm/lib | ||||
| @@ -242,7 +268,7 @@ jobs: | |||||
| - job: OSX_NDK_ARMV7 | - job: OSX_NDK_ARMV7 | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| brew update | brew update | ||||
| @@ -252,7 +278,7 @@ jobs: | |||||
| - job: OSX_IOS_ARMV8 | - job: OSX_IOS_ARMV8 | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| variables: | variables: | ||||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | ||||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 | CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 | ||||
| @@ -262,7 +288,7 @@ jobs: | |||||
| - job: OSX_IOS_ARMV7 | - job: OSX_IOS_ARMV7 | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| variables: | variables: | ||||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | ||||
| CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 | CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 | ||||
| @@ -272,7 +298,7 @@ jobs: | |||||
| - job: OSX_xbuild_DYNAMIC_ARM64 | - job: OSX_xbuild_DYNAMIC_ARM64 | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-12' | |||||
| vmImage: 'macOS-13' | |||||
| variables: | variables: | ||||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | ||||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 | CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 | ||||
| @@ -334,6 +334,24 @@ if [ "$architecture" = "arm64" ]; then | |||||
| rm -rf "$tmpd" | rm -rf "$tmpd" | ||||
| fi | fi | ||||
| no_sme=0 | |||||
| if [ "$architecture" = "arm64" ]; then | |||||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||||
| tmpf="$tmpd/a.S" | |||||
| printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" | |||||
| args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf" | |||||
| no_sme=0 | |||||
| { | |||||
| $compiler_name $flags $args >/dev/null 2>&1 | |||||
| } || { | |||||
| args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf" | |||||
| $compiler_name $flags $args >/dev/null 2>&1 | |||||
| } || { | |||||
| no_sme=1 | |||||
| } | |||||
| rm -rf "$tmpd" | |||||
| fi | |||||
| c11_atomics=0 | c11_atomics=0 | ||||
| case "$data" in | case "$data" in | ||||
| *HAVE_C11*) | *HAVE_C11*) | ||||
| @@ -475,6 +493,7 @@ done | |||||
| printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" | printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" | ||||
| [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" | [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" | ||||
| [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" | [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" | ||||
| [ "$no_sme" -eq 1 ] && printf "NO_SME=1\n" | |||||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | ||||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | ||||
| [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | ||||
| @@ -31,22 +31,23 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | ||||
| endif () | endif () | ||||
| if (USE_OPENMP) | |||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||||
| # NO_AFFINITY = 1 | |||||
| find_package(OpenMP REQUIRED) | |||||
| if (OpenMP_FOUND) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") | |||||
| endif() | |||||
| endif () | |||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (ARM64) | if (ARM64) | ||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||||
| endif () | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||||
| endif() | |||||
| elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||||
| endif () | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||||
| endif() | |||||
| endif () | endif () | ||||
| if (DYNAMIC_LIST) | if (DYNAMIC_LIST) | ||||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | ||||
| @@ -84,7 +84,7 @@ endif () | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") | if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") | ||||
| if (POWER) | if (POWER) | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") | set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") | ||||
| else () | |||||
| elseif (X86_64) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") | set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -182,7 +182,9 @@ endif () | |||||
| if (${CORE} STREQUAL A64FX) | if (${CORE} STREQUAL A64FX) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") | |||||
| elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | ||||
| else () | else () | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | ||||
| @@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2) | |||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | ||||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") | |||||
| else () | else () | ||||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | ||||
| @@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1) | |||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | ||||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") | |||||
| else () | else () | ||||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | ||||
| @@ -220,10 +226,12 @@ endif () | |||||
| if (${CORE} STREQUAL NEOVERSEN1) | if (${CORE} STREQUAL NEOVERSEN1) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") | |||||
| elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") | |||||
| else () | else () | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||||
| endif() | endif() | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -232,21 +240,33 @@ if (${CORE} STREQUAL ARMV8SVE) | |||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | ||||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||||
| else () | else () | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (${CORE} STREQUAL ARMV9SME) | |||||
| if (NOT DYNAMIC_ARCH) | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||||
| else () | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") | |||||
| endif () | |||||
| endif () | |||||
| endif () | |||||
| if (${CORE} STREQUAL CORTEXA510) | if (${CORE} STREQUAL CORTEXA510) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (${CORE} STREQUAL CORTEXA710) | if (${CORE} STREQUAL CORTEXA710) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -258,7 +278,7 @@ endif () | |||||
| if (${CORE} STREQUAL CORTEXX2) | if (${CORE} STREQUAL CORTEXX2) | ||||
| if (NOT DYNAMIC_ARCH) | if (NOT DYNAMIC_ARCH) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L | |||||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | # This is for classic Flang. LLVM Flang is handled with gfortran below. | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | ||||
| endif () | endif () | ||||
| @@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||||
| endif () | endif () | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") | |||||
| endif () | endif () | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") | set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (${F_COMPILER} STREQUAL "FUJITSU") | if (${F_COMPILER} STREQUAL "FUJITSU") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q32") | set(FCOMMON_OPT "${FCOMMON_OPT} -q32") | ||||
| endif () | endif () | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") | |||||
| endif () | endif () | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") | set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") | |||||
| endif () | endif () | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") | |||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FEXTRALIB "${FEXTRALIB} -lstdc++") | set(FEXTRALIB "${FEXTRALIB} -lstdc++") | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | ||||
| endif () | endif () | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") | |||||
| set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (${F_COMPILER} STREQUAL "COMPAQ") | if (${F_COMPILER} STREQUAL "COMPAQ") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||||
| if (NOT USE_OPENMP) | if (NOT USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") | set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") | ||||
| else () | else () | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") | |||||
| # -w=unused: Suppress warning messages about unused variables | # -w=unused: Suppress warning messages about unused variables | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | ||||
| if (USE_OPENMP) | if (USE_OPENMP) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||||
| SetFallback(CROTKERNEL zrot.S) | SetFallback(CROTKERNEL zrot.S) | ||||
| SetFallback(ZROTKERNEL zrot.S) | SetFallback(ZROTKERNEL zrot.S) | ||||
| SetFallback(XROTKERNEL zrot.S) | SetFallback(XROTKERNEL zrot.S) | ||||
| SetFallback(SROTMKERNEL rotm.S) | |||||
| SetFallback(DROTMKERNEL rotm.S) | |||||
| SetFallback(QROTMKERNEL rotm.S) | |||||
| SetFallback(SSCALKERNEL scal.S) | SetFallback(SSCALKERNEL scal.S) | ||||
| SetFallback(DSCALKERNEL scal.S) | SetFallback(DSCALKERNEL scal.S) | ||||
| SetFallback(CSCALKERNEL zscal.S) | SetFallback(CSCALKERNEL zscal.S) | ||||
| @@ -98,6 +98,8 @@ set(CSRC | |||||
| lapacke_cgesv_work.c | lapacke_cgesv_work.c | ||||
| lapacke_cgesvd.c | lapacke_cgesvd.c | ||||
| lapacke_cgesvd_work.c | lapacke_cgesvd_work.c | ||||
| lapacke_cgesvdq.c | |||||
| lapacke_cgesvdq_work.c | |||||
| lapacke_cgesvdx.c | lapacke_cgesvdx.c | ||||
| lapacke_cgesvdx_work.c | lapacke_cgesvdx_work.c | ||||
| lapacke_cgesvj.c | lapacke_cgesvj.c | ||||
| @@ -1766,8 +1768,8 @@ set(SSRC | |||||
| lapacke_strsna_work.c | lapacke_strsna_work.c | ||||
| lapacke_strsyl.c | lapacke_strsyl.c | ||||
| lapacke_strsyl_work.c | lapacke_strsyl_work.c | ||||
| lapacke_ctrsyl3.c | |||||
| lapacke_ctrsyl3_work.c | |||||
| lapacke_strsyl3.c | |||||
| lapacke_strsyl3_work.c | |||||
| lapacke_strtri.c | lapacke_strtri.c | ||||
| lapacke_strtri_work.c | lapacke_strtri_work.c | ||||
| lapacke_strtrs.c | lapacke_strtrs.c | ||||
| @@ -2410,10 +2412,10 @@ set(ZSRC | |||||
| lapacke_ilaver.c | lapacke_ilaver.c | ||||
| ) | ) | ||||
| if (BUILD_LAPACK_DEPRECATED) | if (BUILD_LAPACK_DEPRECATED) | ||||
| set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||||
| set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||||
| set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||||
| set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||||
| list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||||
| list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||||
| list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||||
| list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||||
| endif() | endif() | ||||
| set(SRCX | set(SRCX | ||||
| @@ -1006,15 +1006,15 @@ endif () | |||||
| "#define HAVE_SVE\n" | "#define HAVE_SVE\n" | ||||
| "#define ARMV8\n") | "#define ARMV8\n") | ||||
| set(SGEMM_UNROLL_M 16) | set(SGEMM_UNROLL_M 16) | ||||
| set(SGEMM_UNROLL_N 4) | |||||
| set(DGEMM_UNROLL_M 8) | |||||
| set(DGEMM_UNROLL_N 4) | |||||
| set(CGEMM_UNROLL_M 8) | |||||
| set(SGEMM_UNROLL_N 8) | |||||
| set(DGEMM_UNROLL_M 4) | |||||
| set(DGEMM_UNROLL_N 8) | |||||
| set(CGEMM_UNROLL_M 2) | |||||
| set(CGEMM_UNROLL_N 4) | set(CGEMM_UNROLL_N 4) | ||||
| set(ZGEMM_UNROLL_M 4) | |||||
| set(ZGEMM_UNROLL_M 2) | |||||
| set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
| set(SYMV_P 16) | set(SYMV_P 16) | ||||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") | |||||
| file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
| "#define L1_CODE_SIZE\t65536\n" | "#define L1_CODE_SIZE\t65536\n" | ||||
| "#define L1_CODE_LINESIZE\t64\n" | "#define L1_CODE_LINESIZE\t64\n" | ||||
| @@ -1249,6 +1249,25 @@ endif () | |||||
| set(ZGEMM_UNROLL_M 2) | set(ZGEMM_UNROLL_M 2) | ||||
| set(ZGEMM_UNROLL_N 4) | set(ZGEMM_UNROLL_N 4) | ||||
| set(SYMV_P 16) | set(SYMV_P 16) | ||||
| elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") | |||||
| file(APPEND ${TARGET_CONF_TEMP} | |||||
| "#define L1_DATA_SIZE\t32768\n" | |||||
| "#define L1_DATA_LINESIZE\t64\n" | |||||
| "#define L2_SIZE\t262144\n" | |||||
| "#define L2_LINESIZE\t64\n" | |||||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||||
| "#define DTB_SIZE\t4096\n" | |||||
| "#define L2_ASSOCIATIVE\t32\n" | |||||
| "#define ARMV8\n") | |||||
| set(SGEMM_UNROLL_M 4) | |||||
| set(SGEMM_UNROLL_N 8) | |||||
| set(DGEMM_UNROLL_M 4) | |||||
| set(DGEMM_UNROLL_N 8) | |||||
| set(CGEMM_UNROLL_M 2) | |||||
| set(CGEMM_UNROLL_N 4) | |||||
| set(ZGEMM_UNROLL_M 2) | |||||
| set(ZGEMM_UNROLL_N 4) | |||||
| set(SYMV_P 16) | |||||
| elseif ("${TCORE}" STREQUAL "P5600") | elseif ("${TCORE}" STREQUAL "P5600") | ||||
| file(APPEND ${TARGET_CONF_TEMP} | file(APPEND ${TARGET_CONF_TEMP} | ||||
| "#define L2_SIZE 1048576\n" | "#define L2_SIZE 1048576\n" | ||||
| @@ -1409,9 +1428,11 @@ endif () | |||||
| # GetArch_2nd | # GetArch_2nd | ||||
| foreach(float_char S;D;Q;C;Z;X) | foreach(float_char S;D;Q;C;Z;X) | ||||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | ||||
| message(STATUS "setting unrollm=2") | |||||
| set(${float_char}GEMM_UNROLL_M 2) | set(${float_char}GEMM_UNROLL_M 2) | ||||
| endif() | endif() | ||||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | ||||
| message(STATUS "setting unrolln=2") | |||||
| set(${float_char}GEMM_UNROLL_N 2) | set(${float_char}GEMM_UNROLL_N 2) | ||||
| endif() | endif() | ||||
| endforeach() | endforeach() | ||||
| @@ -21,7 +21,15 @@ endif() | |||||
| # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | ||||
| # It seems we are meant to use TARGET as input and CORE internally as kernel. | # It seems we are meant to use TARGET as input and CORE internally as kernel. | ||||
| if(NOT DEFINED CORE AND DEFINED TARGET) | if(NOT DEFINED CORE AND DEFINED TARGET) | ||||
| set(CORE ${TARGET}) | |||||
| if (${TARGET} STREQUAL "LOONGSON3R5") | |||||
| set(CORE "LA464") | |||||
| elseif (${TARGET} STREQUAL "LOONGSON2K1000") | |||||
| set(CORE "LA264") | |||||
| elseif (${TARGET} STREQUAL "LOONGSONGENERIC") | |||||
| set(CORE "LA64_GENERIC") | |||||
| else () | |||||
| set(CORE ${TARGET}) | |||||
| endif() | |||||
| endif() | endif() | ||||
| # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | ||||
| @@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | ||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if (${TARGET} STREQUAL ARMV9SME) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") | |||||
| endif() | |||||
| if (${TARGET} STREQUAL A64FX) | if (${TARGET} STREQUAL A64FX) | ||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | ||||
| @@ -361,6 +372,20 @@ else () | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (USE_OPENMP) | |||||
| find_package(OpenMP COMPONENTS C REQUIRED) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") | |||||
| if (NOT NOFORTRAN) | |||||
| find_package(OpenMP COMPONENTS Fortran REQUIRED) | |||||
| # Avoid mixed OpenMP linkage | |||||
| get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||||
| get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) | |||||
| if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) | |||||
| message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") | |||||
| endif() | |||||
| endif () | |||||
| endif () | |||||
| if (BINARY64) | if (BINARY64) | ||||
| if (INTERFACE64) | if (INTERFACE64) | ||||
| # CCOMMON_OPT += -DUSE64BITINT | # CCOMMON_OPT += -DUSE64BITINT | ||||
| @@ -620,6 +645,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||||
| endif() | endif() | ||||
| # TODO: not sure what PFLAGS is -hpa | # TODO: not sure what PFLAGS is -hpa | ||||
| set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") | set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") | ||||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||||
| if ("${F_COMPILER}" STREQUAL "FLANG") | |||||
| if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||||
| endif () | |||||
| endif () | |||||
| if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") | |||||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") | |||||
| endif () | |||||
| endif () | |||||
| set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | ||||
| # TODO: not sure what FPFLAGS is -hpa | # TODO: not sure what FPFLAGS is -hpa | ||||
| @@ -632,20 +669,11 @@ if (LAPACK_STRLEN) | |||||
| endif() | endif() | ||||
| set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | ||||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | |||||
| if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||||
| set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") | |||||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||||
| endforeach () | |||||
| endif () | |||||
| if (CMAKE_Fortran_COMPILER) | if (CMAKE_Fortran_COMPILER) | ||||
| if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | ||||
| set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | ||||
| if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | ||||
| message(STATUS "removing fortran flags") | |||||
| message(STATUS "removing fortran flags not supported by the compiler") | |||||
| set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | ||||
| endif () | endif () | ||||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | foreach (FILTER_FLAG ${FILTER_FLAGS}) | ||||
| @@ -676,13 +704,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL | |||||
| set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") | set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") | ||||
| endif () | endif () | ||||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||||
| if ("${F_COMPILER}" STREQUAL "FLANG") | |||||
| if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||||
| endif () | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED SUFFIX) | if (NOT DEFINED SUFFIX) | ||||
| set(SUFFIX o) | set(SUFFIX o) | ||||
| @@ -139,6 +139,17 @@ endif() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if (ARM64) | |||||
| if (NOT NO_SME) | |||||
| file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") | |||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) | |||||
| if (NO_SME EQUAL 1) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") | |||||
| endif() | |||||
| file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o") | |||||
| endif() | |||||
| endif() | |||||
| include(CheckIncludeFile) | include(CheckIncludeFile) | ||||
| CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) | CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) | ||||
| if (HAVE_C11 EQUAL 1) | if (HAVE_C11 EQUAL 1) | ||||
| @@ -16,6 +16,14 @@ endfunction () | |||||
| macro(ParseMakefileVars MAKEFILE_IN) | macro(ParseMakefileVars MAKEFILE_IN) | ||||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | message(STATUS "Reading vars from ${MAKEFILE_IN}...") | ||||
| set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | ||||
| set (OSNAME ${CMAKE_SYSTEM_NAME}) | |||||
| if (${C_COMPILER} MATCHES Clang) | |||||
| set (C_COMPILER CLANG) | |||||
| endif () | |||||
| if (${OSNAME} STREQUAL Windows) | |||||
| set (OSNAME WINNT) | |||||
| endif () | |||||
| message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) | |||||
| set (IfElse 0) | set (IfElse 0) | ||||
| set (ElseSeen 0) | set (ElseSeen 0) | ||||
| set (SkipIfs 0) | set (SkipIfs 0) | ||||
| @@ -702,6 +702,7 @@ void gotoblas_profile_init(void); | |||||
| void gotoblas_profile_quit(void); | void gotoblas_profile_quit(void); | ||||
| int support_avx512(void); | int support_avx512(void); | ||||
| int support_sme1(void); | |||||
| #ifdef USE_OPENMP | #ifdef USE_OPENMP | ||||
| @@ -114,7 +114,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| OPENBLAS_ARM_TYPE_FUNCTION \ | OPENBLAS_ARM_TYPE_FUNCTION \ | ||||
| REALNAME: | REALNAME: | ||||
| #define EPILOGUE | |||||
| #if defined(__ELF__) && defined(__linux__) | |||||
| # define GNUSTACK .section .note.GNU-stack,"",%progbits | |||||
| #else | |||||
| # define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | |||||
| GNUSTACK | |||||
| #define PROFCODE | #define PROFCODE | ||||
| @@ -175,7 +175,7 @@ REALNAME: | |||||
| #define HUGE_PAGESIZE ( 4 << 20) | #define HUGE_PAGESIZE ( 4 << 20) | ||||
| #ifndef BUFFERSIZE | #ifndef BUFFERSIZE | ||||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) | |||||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) | |||||
| #define BUFFER_SIZE (32 << 22) | #define BUFFER_SIZE (32 << 22) | ||||
| #else | #else | ||||
| #define BUFFER_SIZE (32 << 20) | #define BUFFER_SIZE (32 << 20) | ||||
| @@ -22,6 +22,7 @@ | |||||
| #define DSUM_K dsum_k | #define DSUM_K dsum_k | ||||
| #define DSWAP_K dswap_k | #define DSWAP_K dswap_k | ||||
| #define DROT_K drot_k | #define DROT_K drot_k | ||||
| #define DROTM_K drotm_k | |||||
| #define DGEMV_N dgemv_n | #define DGEMV_N dgemv_n | ||||
| #define DGEMV_T dgemv_t | #define DGEMV_T dgemv_t | ||||
| @@ -180,6 +181,7 @@ | |||||
| #define DSUM_K gotoblas -> dsum_k | #define DSUM_K gotoblas -> dsum_k | ||||
| #define DSWAP_K gotoblas -> dswap_k | #define DSWAP_K gotoblas -> dswap_k | ||||
| #define DROT_K gotoblas -> drot_k | #define DROT_K gotoblas -> drot_k | ||||
| #define DROTM_K gotoblas -> drotm_k | |||||
| #define DGEMV_N gotoblas -> dgemv_n | #define DGEMV_N gotoblas -> dgemv_n | ||||
| #define DGEMV_T gotoblas -> dgemv_t | #define DGEMV_T gotoblas -> dgemv_t | ||||
| @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||||
| int drotmg_k(double *, double *, double *, double *, double *); | int drotmg_k(double *, double *, double *, double *, double *); | ||||
| int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | ||||
| int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||||
| int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||||
| int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||||
| int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||||
| int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||||
| int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||||
| int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | ||||
| @@ -70,6 +70,7 @@ | |||||
| #define SUM_K QSUM_K | #define SUM_K QSUM_K | ||||
| #define SWAP_K QSWAP_K | #define SWAP_K QSWAP_K | ||||
| #define ROT_K QROT_K | #define ROT_K QROT_K | ||||
| #define ROTM_K QROTM_K | |||||
| #define GEMV_N QGEMV_N | #define GEMV_N QGEMV_N | ||||
| #define GEMV_T QGEMV_T | #define GEMV_T QGEMV_T | ||||
| @@ -361,6 +362,7 @@ | |||||
| #define SUM_K DSUM_K | #define SUM_K DSUM_K | ||||
| #define SWAP_K DSWAP_K | #define SWAP_K DSWAP_K | ||||
| #define ROT_K DROT_K | #define ROT_K DROT_K | ||||
| #define ROTM_K DROTM_K | |||||
| #define GEMV_N DGEMV_N | #define GEMV_N DGEMV_N | ||||
| #define GEMV_T DGEMV_T | #define GEMV_T DGEMV_T | ||||
| @@ -977,6 +979,7 @@ | |||||
| #define SUM_K SSUM_K | #define SUM_K SSUM_K | ||||
| #define SWAP_K SSWAP_K | #define SWAP_K SSWAP_K | ||||
| #define ROT_K SROT_K | #define ROT_K SROT_K | ||||
| #define ROTM_K SROTM_K | |||||
| #define GEMV_N SGEMV_N | #define GEMV_N SGEMV_N | ||||
| #define GEMV_T SGEMV_T | #define GEMV_T SGEMV_T | ||||
| @@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||||
| double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | ||||
| int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||||
| int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -197,6 +198,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
| //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | ||||
| int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||||
| #endif | #endif | ||||
| #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | ||||
| int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -221,6 +223,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | ||||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | ||||
| #endif | #endif | ||||
| #ifdef ARCH_ARM64 | |||||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||||
| #endif | |||||
| int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | ||||
| int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -330,6 +336,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||||
| #endif | #endif | ||||
| #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | ||||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | ||||
| int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||||
| int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| @@ -439,6 +446,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | ||||
| int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||||
| int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| @@ -22,6 +22,7 @@ | |||||
| #define QSUM_K qsum_k | #define QSUM_K qsum_k | ||||
| #define QSWAP_K qswap_k | #define QSWAP_K qswap_k | ||||
| #define QROT_K qrot_k | #define QROT_K qrot_k | ||||
| #define QROTM_K qrotm_k | |||||
| #define QGEMV_N qgemv_n | #define QGEMV_N qgemv_n | ||||
| #define QGEMV_T qgemv_t | #define QGEMV_T qgemv_t | ||||
| @@ -165,6 +166,7 @@ | |||||
| #define QSUM_K gotoblas -> qsum_k | #define QSUM_K gotoblas -> qsum_k | ||||
| #define QSWAP_K gotoblas -> qswap_k | #define QSWAP_K gotoblas -> qswap_k | ||||
| #define QROT_K gotoblas -> qrot_k | #define QROT_K gotoblas -> qrot_k | ||||
| #define QROTM_K gotoblas -> qrotm_k | |||||
| #define QGEMV_N gotoblas -> qgemv_n | #define QGEMV_N gotoblas -> qgemv_n | ||||
| #define QGEMV_T gotoblas -> qgemv_t | #define QGEMV_T gotoblas -> qgemv_t | ||||
| @@ -24,6 +24,7 @@ | |||||
| #define SSCAL_K sscal_k | #define SSCAL_K sscal_k | ||||
| #define SSWAP_K sswap_k | #define SSWAP_K sswap_k | ||||
| #define SROT_K srot_k | #define SROT_K srot_k | ||||
| #define SROTM_K srotm_k | |||||
| #define SGEMV_N sgemv_n | #define SGEMV_N sgemv_n | ||||
| #define SGEMV_T sgemv_t | #define SGEMV_T sgemv_t | ||||
| @@ -189,6 +190,7 @@ | |||||
| #define SSCAL_K gotoblas -> sscal_k | #define SSCAL_K gotoblas -> sscal_k | ||||
| #define SSWAP_K gotoblas -> sswap_k | #define SSWAP_K gotoblas -> sswap_k | ||||
| #define SROT_K gotoblas -> srot_k | #define SROT_K gotoblas -> srot_k | ||||
| #define SROTM_K gotoblas -> srotm_k | |||||
| #define SGEMV_N gotoblas -> sgemv_n | #define SGEMV_N gotoblas -> sgemv_n | ||||
| #define SGEMV_T gotoblas -> sgemv_t | #define SGEMV_T gotoblas -> sgemv_t | ||||
| @@ -213,9 +215,9 @@ | |||||
| #ifdef ARCH_X86_64 | #ifdef ARCH_X86_64 | ||||
| #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | ||||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | #define SGEMM_DIRECT gotoblas -> sgemm_direct | ||||
| #else | |||||
| #elif ARCH_ARM64 | |||||
| #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | ||||
| #define SGEMM_DIRECT sgemm_direct | |||||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | |||||
| #endif | #endif | ||||
| #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | ||||
| @@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| .global REALNAME ;\ | .global REALNAME ;\ | ||||
| .type REALNAME, %function ;\ | .type REALNAME, %function ;\ | ||||
| REALNAME: | REALNAME: | ||||
| #define EPILOGUE | |||||
| #if defined(__ELF__) && defined(__linux__) | |||||
| # define GNUSTACK .section .note.GNU-stack,"",@progbits | |||||
| #else | |||||
| # define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | |||||
| .size REALNAME, .-REALNAME; \ | |||||
| GNUSTACK | |||||
| #define PROFCODE | #define PROFCODE | ||||
| @@ -65,3 +65,6 @@ _cpuid: | |||||
| .subsections_via_symbols | .subsections_via_symbols | ||||
| #endif | #endif | ||||
| #if defined(__ELF__) && defined(__linux__) | |||||
| .section .note.GNU-stack,"",@progbits | |||||
| #endif | |||||
| @@ -43,6 +43,9 @@ size_t length64=sizeof(value64); | |||||
| #ifndef HWCAP_SVE | #ifndef HWCAP_SVE | ||||
| #define HWCAP_SVE (1 << 22) | #define HWCAP_SVE (1 << 22) | ||||
| #endif | #endif | ||||
| #if (defined OS_WINDOWS) | |||||
| #include <winreg.h> | |||||
| #endif | |||||
| #define get_cpu_ftr(id, var) ({ \ | #define get_cpu_ftr(id, var) ({ \ | ||||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | ||||
| @@ -273,11 +276,11 @@ int detect(void) | |||||
| fclose(infile); | fclose(infile); | ||||
| } | } | ||||
| } | } | ||||
| sprintf(cpuimpl,"0x%2x",implementer); | |||||
| sprintf(cpuimpl,"0x%02x",implementer); | |||||
| cpu_implementer=strdup(cpuimpl); | cpu_implementer=strdup(cpuimpl); | ||||
| } | } | ||||
| qsort(cpucores,1024,sizeof(int),cpusort); | qsort(cpucores,1024,sizeof(int),cpusort); | ||||
| sprintf(cpupart,"0x%3x",cpucores[0]); | |||||
| sprintf(cpupart,"0x%03x",cpucores[0]); | |||||
| cpu_part=strdup(cpupart); | cpu_part=strdup(cpupart); | ||||
| if(cpu_part != NULL && cpu_implementer != NULL) { | if(cpu_part != NULL && cpu_implementer != NULL) { | ||||
| // Arm | // Arm | ||||
| @@ -371,20 +374,47 @@ int detect(void) | |||||
| } | } | ||||
| #else | #else | ||||
| #ifdef __APPLE__ | #ifdef __APPLE__ | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | ||||
| cpulowperf=value64; | cpulowperf=value64; | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | ||||
| if (value64 > 1) { | if (value64 > 1) { | ||||
| sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); | |||||
| cpuhiperf=value64; | cpuhiperf=value64; | ||||
| sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); | |||||
| cpulowperf=value64; | cpulowperf=value64; | ||||
| } | } | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | ||||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | ||||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | ||||
| if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | ||||
| if (value64 == 1867590060) return CPU_VORTEX; //M4 | if (value64 == 1867590060) return CPU_VORTEX; //M4 | ||||
| #else | |||||
| #ifdef OS_WINDOWS | |||||
| HKEY reghandle; | |||||
| HKEY hklm = HKEY_LOCAL_MACHINE; | |||||
| WCHAR valstring[512]; | |||||
| PVOID pvalstring=valstring; | |||||
| DWORD size=sizeof (valstring); | |||||
| DWORD type=RRF_RT_ANY; | |||||
| DWORD flags=0; | |||||
| LPCWSTR subkey= L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"; | |||||
| LPCWSTR field=L"ProcessorNameString"; | |||||
| LONG errcode=RegOpenKeyEx(HKEY_LOCAL_MACHINE,TEXT("Hardware\\Description\\System\\CentralProcessor\\0"), 0, KEY_READ, &reghandle); | |||||
| if (errcode != NO_ERROR) wprintf(L"Could not open registry key for proc0: %x\n",errcode); | |||||
| errcode=RegQueryValueEx(reghandle, "ProcessorNameString", NULL,NULL ,pvalstring,&size); | |||||
| if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); | |||||
| //wprintf(stderr,L"%s\n",(PWSTR)valstring); | |||||
| RegCloseKey(reghandle); | |||||
| if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; | |||||
| if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; | |||||
| if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; | |||||
| if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; | |||||
| #endif | |||||
| #endif | #endif | ||||
| return CPU_ARMV8; | return CPU_ARMV8; | ||||
| #endif | #endif | ||||
| @@ -442,6 +472,7 @@ int n=0; | |||||
| printf("#define NUM_CORES_HP %d\n",cpuhiperf); | printf("#define NUM_CORES_HP %d\n",cpuhiperf); | ||||
| #endif | #endif | ||||
| #ifdef __APPLE__ | #ifdef __APPLE__ | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | ||||
| printf("#define NUM_CORES %d\n",value); | printf("#define NUM_CORES %d\n",value); | ||||
| if (cpulowperf >0) | if (cpulowperf >0) | ||||
| @@ -673,12 +704,17 @@ void get_cpuconfig(void) | |||||
| case CPU_VORTEX: | case CPU_VORTEX: | ||||
| printf("#define VORTEX \n"); | printf("#define VORTEX \n"); | ||||
| #ifdef __APPLE__ | #ifdef __APPLE__ | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | ||||
| printf("#define L1_CODE_SIZE %lld \n",value64); | printf("#define L1_CODE_SIZE %lld \n",value64); | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | ||||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | printf("#define L1_CODE_LINESIZE %lld \n",value64); | ||||
| printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | ||||
| printf("#define L1_DATA_SIZE %lld \n",value64); | printf("#define L1_DATA_SIZE %lld \n",value64); | ||||
| length64 = sizeof(value64); | |||||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | ||||
| printf("#define L2_SIZE %lld \n",value64); | printf("#define L2_SIZE %lld \n",value64); | ||||
| #endif | #endif | ||||
| @@ -1578,6 +1578,7 @@ int get_cpuname(void){ | |||||
| case 12: //family 6 exmodel 12 | case 12: //family 6 exmodel 12 | ||||
| switch (model) { | switch (model) { | ||||
| case 15: | case 15: | ||||
| case 6: // Arrow Lake | |||||
| if(support_avx512()) | if(support_avx512()) | ||||
| return CPUTYPE_SAPPHIRERAPIDS; | return CPUTYPE_SAPPHIRERAPIDS; | ||||
| if(support_avx2()) | if(support_avx2()) | ||||
| @@ -2421,6 +2422,22 @@ int get_coretype(void){ | |||||
| else | else | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| } | } | ||||
| case 12: | |||||
| switch (model) { | |||||
| case 6: // Arrow Lake | |||||
| if(support_amx_bf16()) | |||||
| return CORE_SAPPHIRERAPIDS; | |||||
| if(support_avx512_bf16()) | |||||
| return CORE_COOPERLAKE; | |||||
| if(support_avx512()) | |||||
| return CORE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CORE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | |||||
| } | } | ||||
| case 15: | case 15: | ||||
| if (model <= 0x2) return CORE_NORTHWOOD; | if (model <= 0x2) return CORE_NORTHWOOD; | ||||
| @@ -6,7 +6,7 @@ enable_language(Fortran) | |||||
| endif() | endif() | ||||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | ||||
| if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) | |||||
| if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1) | |||||
| list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) | list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) | ||||
| set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) | set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) | ||||
| endif() | endif() | ||||
| @@ -44,10 +44,6 @@ else() | |||||
| c_${float_char}blas1.c) | c_${float_char}blas1.c) | ||||
| endif() | endif() | ||||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | ||||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||||
| target_link_libraries(x${float_char}cblat1 omp pthread) | |||||
| endif() | |||||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | ||||
| target_link_libraries(x${float_char}cblat1 m) | target_link_libraries(x${float_char}cblat1 m) | ||||
| endif() | endif() | ||||
| @@ -73,10 +69,6 @@ else() | |||||
| constant.c) | constant.c) | ||||
| endif() | endif() | ||||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | ||||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||||
| target_link_libraries(x${float_char}cblat2 omp pthread) | |||||
| endif() | |||||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | ||||
| target_link_libraries(x${float_char}cblat2 m) | target_link_libraries(x${float_char}cblat2 m) | ||||
| endif() | endif() | ||||
| @@ -124,20 +116,12 @@ else() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | ||||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||||
| endif() | |||||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | ||||
| target_link_libraries(x${float_char}cblat3 m) | target_link_libraries(x${float_char}cblat3 m) | ||||
| endif() | endif() | ||||
| if (USE_GEMM3M) | if (USE_GEMM3M) | ||||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | ||||
| target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | ||||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||||
| endif() | |||||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | ||||
| target_link_libraries(x${float_char}cblat3_3m m) | target_link_libraries(x${float_char}cblat3_3m m) | ||||
| endif() | endif() | ||||
| @@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| ifeq ($(F_COMPILER), GFORTRAN) | ifeq ($(F_COMPILER), GFORTRAN) | ||||
| ifeq ($(C_COMPILER), CLANG) | ifeq ($(C_COMPILER), CLANG) | ||||
| CEXTRALIB += -lomp | |||||
| EXTRALIB += -lomp | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(F_COMPILER), NAG) | ifeq ($(F_COMPILER), NAG) | ||||
| CEXTRALIB = -lgomp | |||||
| EXTRALIB = -lgomp | |||||
| endif | endif | ||||
| ifeq ($(F_COMPILER), IBM) | ifeq ($(F_COMPILER), IBM) | ||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| CEXTRALIB += -lgomp | |||||
| EXTRALIB += -lgomp | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), CLANG) | ifeq ($(C_COMPILER), CLANG) | ||||
| CEXTRALIB += -lomp | |||||
| EXTRALIB += -lomp | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -440,7 +440,7 @@ static real c_b43 = (float)1.; | |||||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | ||||
| static complex mwpcs[5], mwpct[5]; | static complex mwpcs[5], mwpct[5]; | ||||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | ||||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||||
| extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); | |||||
| static complex cx[8]; | static complex cx[8]; | ||||
| extern real scnrm2test_(integer*, complex*, integer*); | extern real scnrm2test_(integer*, complex*, integer*); | ||||
| static integer np1; | static integer np1; | ||||
| @@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a | |||||
| 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | ||||
| by invoking Ninja: | by invoking Ninja: | ||||
| ```cmd | ```cmd | ||||
| cd OpenBLAS | cd OpenBLAS | ||||
| mkdir build | mkdir build | ||||
| cd build | cd build | ||||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new | |||||
| ninja -j16 | ninja -j16 | ||||
| ``` | ``` | ||||
| @@ -223,3 +223,7 @@ if (USE_THREAD) | |||||
| endif () | endif () | ||||
| add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) | add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) | ||||
| if (USE_OPENMP) | |||||
| target_link_libraries(driver_level2 OpenMP::OpenMP_C) | |||||
| endif() | |||||
| @@ -171,3 +171,7 @@ endforeach () | |||||
| # | # | ||||
| add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) | add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) | ||||
| if (USE_OPENMP) | |||||
| target_link_libraries(driver_level3 OpenMP::OpenMP_C) | |||||
| endif() | |||||
| @@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| #ifdef USE_OPENMP | #ifdef USE_OPENMP | ||||
| static omp_lock_t level3_lock, critical_section_lock; | static omp_lock_t level3_lock, critical_section_lock; | ||||
| static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, | |||||
| static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, | |||||
| parallel_section_left = MAX_PARALLEL_NUMBER; | parallel_section_left = MAX_PARALLEL_NUMBER; | ||||
| // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | ||||
| @@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| BLASLONG nthreads = args -> nthreads; | BLASLONG nthreads = args -> nthreads; | ||||
| BLASLONG width, i, j, k, js; | |||||
| BLASLONG width, width_n, i, j, k, js; | |||||
| BLASLONG m, n, n_from, n_to; | BLASLONG m, n, n_from, n_to; | ||||
| int mode; | int mode; | ||||
| #if defined(DYNAMIC_ARCH) | #if defined(DYNAMIC_ARCH) | ||||
| @@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| /* Partition (a step of) n into nthreads regions */ | /* Partition (a step of) n into nthreads regions */ | ||||
| range_N[0] = js; | range_N[0] = js; | ||||
| num_parts = 0; | num_parts = 0; | ||||
| while (n > 0){ | |||||
| width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | |||||
| if (width < switch_ratio && width > 1) { | |||||
| width = switch_ratio; | |||||
| for(j = 0; j < nthreads_n; j++){ | |||||
| width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); | |||||
| n -= width_n; | |||||
| for(i = 0; i < nthreads_m; i++){ | |||||
| width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); | |||||
| if (width < switch_ratio) { | |||||
| width = switch_ratio; | |||||
| } | |||||
| width = round_up(width_n, width, GEMM_PREFERED_SIZE); | |||||
| width_n -= width; | |||||
| if (width_n < 0) { | |||||
| width = width + width_n; | |||||
| width_n = 0; | |||||
| } | |||||
| range_N[num_parts + 1] = range_N[num_parts] + width; | |||||
| num_parts ++; | |||||
| } | } | ||||
| width = round_up(n, width, GEMM_PREFERED_SIZE); | |||||
| n -= width; | |||||
| if (n < 0) width = width + n; | |||||
| range_N[num_parts + 1] = range_N[num_parts] + width; | |||||
| num_parts ++; | |||||
| } | } | ||||
| for (j = num_parts; j < MAX_CPU_NUMBER; j++) { | for (j = num_parts; j < MAX_CPU_NUMBER; j++) { | ||||
| range_N[j + 1] = range_N[num_parts]; | range_N[j + 1] = range_N[num_parts]; | ||||
| @@ -844,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||||
| /* Objective function come from sum of partitions in m and n. */ | /* Objective function come from sum of partitions in m and n. */ | ||||
| /* (n / nthreads_n) + (m / nthreads_m) */ | /* (n / nthreads_n) + (m / nthreads_m) */ | ||||
| /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ | /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ | ||||
| while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { | |||||
| nthreads_m /= 2; | |||||
| nthreads_n *= 2; | |||||
| BLASLONG cost = 0, div = 0; | |||||
| BLASLONG i; | |||||
| for (i = 1; i <= sqrt(nthreads_m); i++) { | |||||
| if (nthreads_m % i) continue; | |||||
| BLASLONG j = nthreads_m / i; | |||||
| BLASLONG cost_i = n * j + m * nthreads_n * i; | |||||
| BLASLONG cost_j = n * i + m * nthreads_n * j; | |||||
| if (cost == 0 || | |||||
| cost_i < cost) {cost = cost_i; div = i;} | |||||
| if (cost_j < cost) {cost = cost_j; div = j;} | |||||
| } | |||||
| if (div > 1) { | |||||
| nthreads_m /= div; | |||||
| nthreads_n *= div; | |||||
| } | } | ||||
| } | } | ||||
| @@ -88,3 +88,7 @@ endif () | |||||
| #endif | #endif | ||||
| add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) | add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) | ||||
| if (USE_OPENMP) | |||||
| target_link_libraries(driver_others OpenMP::OpenMP_C) | |||||
| endif() | |||||
| @@ -146,8 +146,8 @@ typedef struct { | |||||
| } thread_status_t; | } thread_status_t; | ||||
| #ifdef HAVE_C11 | #ifdef HAVE_C11 | ||||
| #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) | |||||
| #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) | |||||
| #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) | |||||
| #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE) | |||||
| #else | #else | ||||
| #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) | #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) | ||||
| #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) | #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) | ||||
| @@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| #ifdef SMP_SERVER | #ifdef SMP_SERVER | ||||
| // Handle lazy re-init of the thread-pool after a POSIX fork | // Handle lazy re-init of the thread-pool after a POSIX fork | ||||
| LOCK_COMMAND(&server_lock); | |||||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | if (unlikely(blas_server_avail == 0)) blas_thread_init(); | ||||
| UNLOCK_COMMAND(&server_lock); | |||||
| #endif | #endif | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| blas_queue_t *current = queue; | blas_queue_t *current = queue; | ||||
| @@ -43,6 +43,14 @@ | |||||
| #include <sys/auxv.h> | #include <sys/auxv.h> | ||||
| #endif | #endif | ||||
| #ifdef __APPLE__ | |||||
| #include <sys/sysctl.h> | |||||
| int32_t value; | |||||
| size_t length=sizeof(value); | |||||
| int64_t value64; | |||||
| size_t length64=sizeof(value64); | |||||
| #endif | |||||
| extern gotoblas_t gotoblas_ARMV8; | extern gotoblas_t gotoblas_ARMV8; | ||||
| #ifdef DYNAMIC_LIST | #ifdef DYNAMIC_LIST | ||||
| #ifdef DYN_CORTEXA53 | #ifdef DYN_CORTEXA53 | ||||
| @@ -115,7 +123,12 @@ extern gotoblas_t gotoblas_ARMV8SVE; | |||||
| #else | #else | ||||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | #define gotoblas_ARMV8SVE gotoblas_ARMV8 | ||||
| #endif | #endif | ||||
| #ifdef DYN_CORTEX_A55 | |||||
| #ifdef DYN_ARMV9SME | |||||
| extern gotoblas_t gotoblas_ARMV9SME; | |||||
| #else | |||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_CORTEXA55 | |||||
| extern gotoblas_t gotoblas_CORTEXA55; | extern gotoblas_t gotoblas_CORTEXA55; | ||||
| #else | #else | ||||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | #define gotoblas_CORTEXA55 gotoblas_ARMV8 | ||||
| @@ -142,21 +155,28 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||||
| extern gotoblas_t gotoblas_NEOVERSEN2; | extern gotoblas_t gotoblas_NEOVERSEN2; | ||||
| extern gotoblas_t gotoblas_ARMV8SVE; | extern gotoblas_t gotoblas_ARMV8SVE; | ||||
| extern gotoblas_t gotoblas_A64FX; | extern gotoblas_t gotoblas_A64FX; | ||||
| #ifndef NO_SME | |||||
| extern gotoblas_t gotoblas_ARMV9SME; | |||||
| #else | |||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||||
| #endif | |||||
| #else | #else | ||||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | ||||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | ||||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | #define gotoblas_ARMV8SVE gotoblas_ARMV8 | ||||
| #define gotoblas_A64FX gotoblas_ARMV8 | #define gotoblas_A64FX gotoblas_ARMV8 | ||||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||||
| #endif | #endif | ||||
| extern gotoblas_t gotoblas_THUNDERX3T110; | extern gotoblas_t gotoblas_THUNDERX3T110; | ||||
| #endif | #endif | ||||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 | |||||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2 | |||||
| extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
| #define FALLBACK_VERBOSE 1 | #define FALLBACK_VERBOSE 1 | ||||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | ||||
| #define NUM_CORETYPES 18 | |||||
| #define NUM_CORETYPES 19 | |||||
| /* | /* | ||||
| * In case asm/hwcap.h is outdated on the build system, make sure | * In case asm/hwcap.h is outdated on the build system, make sure | ||||
| @@ -168,6 +188,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
| #ifndef HWCAP_SVE | #ifndef HWCAP_SVE | ||||
| #define HWCAP_SVE (1 << 22) | #define HWCAP_SVE (1 << 22) | ||||
| #endif | #endif | ||||
| #ifndef HWCAP2_SME | |||||
| #define HWCAP2_SME 1<<23 | |||||
| #endif | |||||
| #define get_cpu_ftr(id, var) ({ \ | #define get_cpu_ftr(id, var) ({ \ | ||||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | ||||
| @@ -192,6 +215,7 @@ static char *corename[] = { | |||||
| "cortexa55", | "cortexa55", | ||||
| "armv8sve", | "armv8sve", | ||||
| "a64fx", | "a64fx", | ||||
| "armv9sme", | |||||
| "unknown" | "unknown" | ||||
| }; | }; | ||||
| @@ -214,6 +238,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | ||||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_A64FX) return corename[17]; | if (gotoblas == &gotoblas_A64FX) return corename[17]; | ||||
| if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | |||||
| return corename[NUM_CORETYPES]; | return corename[NUM_CORETYPES]; | ||||
| } | } | ||||
| @@ -251,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||||
| case 15: return (&gotoblas_CORTEXA55); | case 15: return (&gotoblas_CORTEXA55); | ||||
| case 16: return (&gotoblas_ARMV8SVE); | case 16: return (&gotoblas_ARMV8SVE); | ||||
| case 17: return (&gotoblas_A64FX); | case 17: return (&gotoblas_A64FX); | ||||
| case 18: return (&gotoblas_ARMV9SME); | |||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
| openblas_warning(1, message); | openblas_warning(1, message); | ||||
| @@ -262,6 +288,11 @@ static gotoblas_t *get_coretype(void) { | |||||
| char coremsg[128]; | char coremsg[128]; | ||||
| #if defined (OS_DARWIN) | #if defined (OS_DARWIN) | ||||
| //future #if !defined(NO_SME) | |||||
| // if (support_sme1()) { | |||||
| // return &gotoblas_ARMV9SME; | |||||
| // } | |||||
| // #endif | |||||
| return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
| #endif | #endif | ||||
| @@ -409,13 +440,21 @@ static gotoblas_t *get_coretype(void) { | |||||
| return &gotoblas_TSV110; | return &gotoblas_TSV110; | ||||
| } | } | ||||
| break; | break; | ||||
| case 0x50: // Ampere | |||||
| case 0x50: // Ampere/AppliedMicro | |||||
| switch (part) | switch (part) | ||||
| { | { | ||||
| case 0x000: // Skylark/EMAG8180 | case 0x000: // Skylark/EMAG8180 | ||||
| return &gotoblas_EMAG8180; | return &gotoblas_EMAG8180; | ||||
| } | } | ||||
| break; | break; | ||||
| case 0xc0: // Ampere | |||||
| switch(part) | |||||
| { | |||||
| case 0xac3: | |||||
| case 0xac4: | |||||
| return &gotoblas_NEOVERSEN1; | |||||
| } | |||||
| break; | |||||
| case 0x51: // Qualcomm | case 0x51: // Qualcomm | ||||
| switch (part) | switch (part) | ||||
| { | { | ||||
| @@ -424,12 +463,20 @@ static gotoblas_t *get_coretype(void) { | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x61: // Apple | case 0x61: // Apple | ||||
| //future if (support_sme1()) return &gotoblas_ARMV9SME; | |||||
| return &gotoblas_NEOVERSEN1; | return &gotoblas_NEOVERSEN1; | ||||
| break; | break; | ||||
| default: | default: | ||||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | ||||
| openblas_warning(1, coremsg); | openblas_warning(1, coremsg); | ||||
| } | } | ||||
| #if !defined(NO_SME) | |||||
| if (support_sme1()) { | |||||
| return &gotoblas_ARMV9SME; | |||||
| } | |||||
| #endif | |||||
| #ifndef NO_SVE | #ifndef NO_SVE | ||||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | ||||
| return &gotoblas_ARMV8SVE; | return &gotoblas_ARMV8SVE; | ||||
| @@ -480,3 +527,19 @@ void gotoblas_dynamic_init(void) { | |||||
| void gotoblas_dynamic_quit(void) { | void gotoblas_dynamic_quit(void) { | ||||
| gotoblas = NULL; | gotoblas = NULL; | ||||
| } | } | ||||
| int support_sme1(void) { | |||||
| int ret = 0; | |||||
| #if (defined OS_LINUX || defined OS_ANDROID) | |||||
| ret = getauxval(AT_HWCAP2) & HWCAP2_SME; | |||||
| if(getauxval(AT_HWCAP2) & HWCAP2_SME){ | |||||
| ret = 1; | |||||
| } | |||||
| #endif | |||||
| #if defined(__APPLE__) | |||||
| sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0); | |||||
| ret = value64; | |||||
| #endif | |||||
| return ret; | |||||
| } | |||||
| @@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL) | |||||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | ||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
| else ifeq ($(F_COMPILER), FLANG) | |||||
| else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW)) | |||||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | ||||
| @@ -21,7 +21,7 @@ | |||||
| chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | ||||
| chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | ||||
| csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | ||||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); | |||||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); | |||||
| @blasobjsd = ( | @blasobjsd = ( | ||||
| damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | ||||
| @@ -29,7 +29,7 @@ | |||||
| dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | ||||
| dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | ||||
| dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | ||||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); | |||||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); | |||||
| @blasobjss = ( | @blasobjss = ( | ||||
| isamax,isamin,ismax,ismin, | isamax,isamin,ismax,ismin, | ||||
| @@ -38,7 +38,7 @@ | |||||
| smax,smin,snrm2,simatcopy,somatcopy, | smax,smin,snrm2,simatcopy,somatcopy, | ||||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | ||||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | ||||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); | |||||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); | |||||
| @blasobjsz = ( | @blasobjsz = ( | ||||
| izamax,izamin,, | izamax,izamin,, | ||||
| @@ -48,28 +48,29 @@ | |||||
| zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | ||||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | ||||
| zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | ||||
| zgeadd, dzsum, zgemmt); | |||||
| zgeadd, dzsum, zgemmt,zgemmtr); | |||||
| @blasobjs = (lsame, xerbla); | @blasobjs = (lsame, xerbla); | ||||
| @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||||
| @bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||||
| @cblasobjsc = ( | @cblasobjsc = ( | ||||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | ||||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | ||||
| cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, | cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, | ||||
| cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, | cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, | ||||
| cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | ||||
| cblas_scnrm2, cblas_scasum, | |||||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy | |||||
| cblas_cgemmt); | |||||
| cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, | |||||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, | |||||
| cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); | |||||
| @cblasobjsd = ( | @cblasobjsd = ( | ||||
| cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | ||||
| cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | ||||
| cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, | cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, | ||||
| cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, | cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, | ||||
| cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | ||||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, | |||||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy | |||||
| cblas_dgemmt); | |||||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, | |||||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, | |||||
| cblas_damax, cblas_damin, cblas_dgemm_batch); | |||||
| @cblasobjss = ( | @cblasobjss = ( | ||||
| cblas_sasum, cblas_saxpy, cblas_saxpby, | cblas_sasum, cblas_saxpy, cblas_saxpby, | ||||
| @@ -78,9 +79,10 @@ | |||||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | ||||
| cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | ||||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | ||||
| cblas_strsv, cblas_sgeadd, | |||||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy | |||||
| cblas_sgemmt); | |||||
| cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, | |||||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, | |||||
| cblas_samax, cblas_samin, cblas_sgemm_batch); | |||||
| @cblasobjsz = ( | @cblasobjsz = ( | ||||
| cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | ||||
| cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | ||||
| @@ -88,13 +90,13 @@ | |||||
| cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | ||||
| cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | ||||
| cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | ||||
| cblas_zaxpby, cblas_zgeadd, | |||||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy | |||||
| cblas_zgemmt); | |||||
| cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, | |||||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, | |||||
| cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); | |||||
| @cblasobjs = ( cblas_xerbla ); | @cblasobjs = ( cblas_xerbla ); | ||||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); | |||||
| @exblasobjs = ( | @exblasobjs = ( | ||||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | ||||
| @@ -709,6 +711,7 @@ zpotri, | |||||
| # functions added for lapack-3.7.0 | # functions added for lapack-3.7.0 | ||||
| @lapackobjs2s = (@lapackobjs2s, | @lapackobjs2s = (@lapackobjs2s, | ||||
| slarfy, | slarfy, | ||||
| ssyconvf, | |||||
| strevc3, | strevc3, | ||||
| sgelqt, | sgelqt, | ||||
| sgelqt3, | sgelqt3, | ||||
| @@ -832,12 +835,82 @@ zpotri, | |||||
| zungtsqr_row | zungtsqr_row | ||||
| ); | ); | ||||
| #functions added for lapack-3.11 | |||||
| @lapackobjs2c = (@lapackobjs2c, | |||||
| cgedmd, | |||||
| cgedmdq | |||||
| ); | |||||
| @lapackobjs2d = (@lapackobjs2d, | |||||
| dgedmd, | |||||
| dgedmdq | |||||
| ); | |||||
| @lapackobjs2s = (@lapackobjs2s, | |||||
| sgedmd, | |||||
| sgedmdq | |||||
| ); | |||||
| @lapackobjs2z = (@lapackobjs2z, | |||||
| zgedmd, | |||||
| zgedmdq | |||||
| ); | |||||
| #functions added post 3.11 | |||||
| @lapackobjs2c = (@lapackobjs2c, | |||||
| cgelst, | |||||
| cgeqp3rk, | |||||
| claqp2rk, | |||||
| claqp3rk, | |||||
| clatrs3, | |||||
| crscl, | |||||
| ctrsyl3 | |||||
| ); | |||||
| # claqz0 | |||||
| # claqz1 | |||||
| # claqz2 | |||||
| # claqz3 | |||||
| # clatrs3 | |||||
| @lapackobjs2d = (@lapackobjs2d, | |||||
| dgelst, | |||||
| dgeqp3rk, | |||||
| dlaqp2rk, | |||||
| dlaqp3rk, | |||||
| dlarmm, | |||||
| dlatrs3, | |||||
| dtrsyl3 | |||||
| ); | |||||
| @lapackobjs2s = (@lapackobjs2s, | |||||
| sgelst, | |||||
| sgeqp3rk, | |||||
| slaqp2rk, | |||||
| slaqp3rk, | |||||
| slarmm, | |||||
| slatrs3, | |||||
| strsyl3 | |||||
| ); | |||||
| @lapackobjs2z = (@lapackobjs2z, | |||||
| zgelst, | |||||
| zgeqp3rk, | |||||
| zlaqp2rk, | |||||
| zlaqp3rk, | |||||
| zlatrs3, | |||||
| zrscl, | |||||
| ztrsyl3 | |||||
| ); | |||||
| # zlaqz0 | |||||
| # zlaqz1 | |||||
| # zlaqz2 | |||||
| # zlaqz3 | |||||
| @lapack_extendedprecision_objs = ( | @lapack_extendedprecision_objs = ( | ||||
| zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, | zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, | ||||
| dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | ||||
| ); | ); | ||||
| @lapack_deprecated_objsc = ( | @lapack_deprecated_objsc = ( | ||||
| cgelqs, cgeqrs, | |||||
| cgegs, cggsvd, | cgegs, cggsvd, | ||||
| cgegv, cggsvp, | cgegv, cggsvp, | ||||
| cgelsx, clahrd, | cgelsx, clahrd, | ||||
| @@ -845,13 +918,16 @@ zpotri, | |||||
| ctzrqf, | ctzrqf, | ||||
| ); | ); | ||||
| @lapack_deprecated_objsd = ( | @lapack_deprecated_objsd = ( | ||||
| dgelqs, dgeqrs, | |||||
| dgegs, dgeqpf, | dgegs, dgeqpf, | ||||
| dgegv, dggsvd, | dgegv, dggsvd, | ||||
| dgelsx, dggsvp, | dgelsx, dggsvp, | ||||
| dlahrd, | dlahrd, | ||||
| dlatzm, dtzrqf); | dlatzm, dtzrqf); | ||||
| @lapack_deprecated_objss = ( | |||||
| @lapack_deprecated_objss = ( | |||||
| sgelqs, | |||||
| sgeqrs, | |||||
| sgelsx, | sgelsx, | ||||
| sgegs, | sgegs, | ||||
| sgegv, | sgegv, | ||||
| @@ -864,6 +940,8 @@ zpotri, | |||||
| ); | ); | ||||
| @lapack_deprecated_objsz = ( | @lapack_deprecated_objsz = ( | ||||
| zgelqs, | |||||
| zgeqrs, | |||||
| zgegs, | zgegs, | ||||
| zgegv, | zgegv, | ||||
| zgelsx, | zgelsx, | ||||
| @@ -997,6 +1075,10 @@ zpotri, | |||||
| LAPACKE_cgebrd_work, | LAPACKE_cgebrd_work, | ||||
| LAPACKE_cgecon, | LAPACKE_cgecon, | ||||
| LAPACKE_cgecon_work, | LAPACKE_cgecon_work, | ||||
| LAPACKE_cgedmd, | |||||
| LAPACKE_cgedmd_work, | |||||
| LAPACKE_cgedmdq, | |||||
| LAPACKE_cgedmdq_work, | |||||
| LAPACKE_cgeequ, | LAPACKE_cgeequ, | ||||
| LAPACKE_cgeequ_work, | LAPACKE_cgeequ_work, | ||||
| LAPACKE_cgeequb, | LAPACKE_cgeequb, | ||||
| @@ -1584,8 +1666,15 @@ zpotri, | |||||
| LAPACKE_cgetsqrhrt, | LAPACKE_cgetsqrhrt, | ||||
| LAPACKE_cgetsqrhrt_work, | LAPACKE_cgetsqrhrt_work, | ||||
| LAPACKE_cungtsqr_row, | LAPACKE_cungtsqr_row, | ||||
| LAPACKE_cungtsqr_row_work | |||||
| LAPACKE_cungtsqr_row_work, | |||||
| LAPACKE_clangb, | |||||
| LAPACKE_clangb_work, | |||||
| LAPACKE_ctrsyl3, | |||||
| LAPACKE_ctrsyl3_work, | |||||
| LAPACKE_ctz_nancheck, | |||||
| LAPACKE_ctz_trans, | |||||
| LAPACKE_cunhr_col, | |||||
| LAPACKE_cunhr_col_work | |||||
| ); | ); | ||||
| @lapackeobjsd = ( | @lapackeobjsd = ( | ||||
| LAPACKE_dgb_nancheck, | LAPACKE_dgb_nancheck, | ||||
| @@ -1656,6 +1745,10 @@ zpotri, | |||||
| LAPACKE_dgebrd_work, | LAPACKE_dgebrd_work, | ||||
| LAPACKE_dgecon, | LAPACKE_dgecon, | ||||
| LAPACKE_dgecon_work, | LAPACKE_dgecon_work, | ||||
| LAPACKE_dgedmd, | |||||
| LAPACKE_dgedmd_work, | |||||
| LAPACKE_dgedmdq, | |||||
| LAPACKE_dgedmdq_work, | |||||
| LAPACKE_dgeequ, | LAPACKE_dgeequ, | ||||
| LAPACKE_dgeequ_work, | LAPACKE_dgeequ_work, | ||||
| LAPACKE_dgeequb, | LAPACKE_dgeequb, | ||||
| @@ -2197,7 +2290,15 @@ zpotri, | |||||
| LAPACKE_dgetsqrhrt, | LAPACKE_dgetsqrhrt, | ||||
| LAPACKE_dgetsqrhrt_work, | LAPACKE_dgetsqrhrt_work, | ||||
| LAPACKE_dorgtsqr_row, | LAPACKE_dorgtsqr_row, | ||||
| LAPACKE_dorgtsqr_row_work | |||||
| LAPACKE_dorgtsqr_row_work, | |||||
| LAPACKE_dlangb, | |||||
| LAPACKE_dlangb_work, | |||||
| LAPACKE_dorhr_col, | |||||
| LAPACKE_dorhr_col_work, | |||||
| LAPACKE_dtrsyl3, | |||||
| LAPACKE_dtrsyl3_work, | |||||
| LAPACKE_dtz_nancheck, | |||||
| LAPACKE_dtz_trans, | |||||
| ); | ); | ||||
| @lapackeobjss = ( | @lapackeobjss = ( | ||||
| @@ -2269,6 +2370,10 @@ zpotri, | |||||
| LAPACKE_sgebrd_work, | LAPACKE_sgebrd_work, | ||||
| LAPACKE_sgecon, | LAPACKE_sgecon, | ||||
| LAPACKE_sgecon_work, | LAPACKE_sgecon_work, | ||||
| LAPACKE_sgedmd, | |||||
| LAPACKE_sgedmd_work, | |||||
| LAPACKE_sgedmdq, | |||||
| LAPACKE_sgedmdq_work, | |||||
| LAPACKE_sgeequ, | LAPACKE_sgeequ, | ||||
| LAPACKE_sgeequ_work, | LAPACKE_sgeequ_work, | ||||
| LAPACKE_sgeequb, | LAPACKE_sgeequb, | ||||
| @@ -2802,7 +2907,15 @@ zpotri, | |||||
| LAPACKE_sgetsqrhrt, | LAPACKE_sgetsqrhrt, | ||||
| LAPACKE_sgetsqrhrt_work, | LAPACKE_sgetsqrhrt_work, | ||||
| LAPACKE_sorgtsqr_row, | LAPACKE_sorgtsqr_row, | ||||
| LAPACKE_sorgtsqr_row_work | |||||
| LAPACKE_sorgtsqr_row_work, | |||||
| LAPACKE_slangb, | |||||
| LAPACKE_slangb_work, | |||||
| LAPACKE_sorhr_col, | |||||
| LAPACKE_sorhr_col_work, | |||||
| LAPACKE_strsyl3, | |||||
| LAPACKE_strsyl3_work, | |||||
| LAPACKE_stz_nancheck, | |||||
| LAPACKE_stz_trans, | |||||
| ); | ); | ||||
| @lapackeobjsz = ( | @lapackeobjsz = ( | ||||
| @@ -2878,6 +2991,10 @@ zpotri, | |||||
| LAPACKE_zgebrd_work, | LAPACKE_zgebrd_work, | ||||
| LAPACKE_zgecon, | LAPACKE_zgecon, | ||||
| LAPACKE_zgecon_work, | LAPACKE_zgecon_work, | ||||
| LAPACKE_zgedmd, | |||||
| LAPACKE_zgedmd_work, | |||||
| LAPACKE_zgedmdq, | |||||
| LAPACKE_zgedmdq_work, | |||||
| LAPACKE_zgeequ, | LAPACKE_zgeequ, | ||||
| LAPACKE_zgeequ_work, | LAPACKE_zgeequ_work, | ||||
| LAPACKE_zgeequb, | LAPACKE_zgeequb, | ||||
| @@ -3345,7 +3462,15 @@ zpotri, | |||||
| LAPACKE_zgetsqrhrt, | LAPACKE_zgetsqrhrt, | ||||
| LAPACKE_zgetsqrhrt_work, | LAPACKE_zgetsqrhrt_work, | ||||
| LAPACKE_zungtsqr_row, | LAPACKE_zungtsqr_row, | ||||
| LAPACKE_zungtsqr_row_work | |||||
| LAPACKE_zungtsqr_row_work, | |||||
| LAPACKE_zlangb, | |||||
| LAPACKE_zlangb_work, | |||||
| LAPACKE_zunhr_col, | |||||
| LAPACKE_zunhr_col_work, | |||||
| LAPACKE_ztrsyl3, | |||||
| LAPACKE_ztrsyl3_work, | |||||
| LAPACKE_ztz_nancheck, | |||||
| LAPACKE_ztz_trans, | |||||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | ||||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | ||||
| @@ -3551,7 +3676,7 @@ zpotri, | |||||
| LAPACKE_zsytrs_aa_2stage_work, | LAPACKE_zsytrs_aa_2stage_work, | ||||
| # new functions from 3.9.0 | # new functions from 3.9.0 | ||||
| LAPACKE_zgesvdq, | LAPACKE_zgesvdq, | ||||
| LAPACKE_zgesvdq_work | |||||
| LAPACKE_zgesvdq_work, | |||||
| ); | ); | ||||
| #These function may need 2 underscores. | #These function may need 2 underscores. | ||||
| @@ -3573,7 +3698,7 @@ zpotri, | |||||
| ssygv_2stage, | ssygv_2stage, | ||||
| ssysv_aa_2stage, ssytrf_aa_2stage, | ssysv_aa_2stage, ssytrf_aa_2stage, | ||||
| ssytrs_aa_2stage, | ssytrs_aa_2stage, | ||||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, | |||||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett | |||||
| ); | ); | ||||
| @lapack_embeded_underscore_objs_c=( | @lapack_embeded_underscore_objs_c=( | ||||
| chetf2_rook, chetrf_rook, chetri_rook, | chetf2_rook, chetrf_rook, chetri_rook, | ||||
| @@ -3598,7 +3723,7 @@ zpotri, | |||||
| chetrf_aa_2stage, chetrs_aa_2stage, | chetrf_aa_2stage, chetrs_aa_2stage, | ||||
| csysv_aa_2stage, csytrf_aa_2stage, | csysv_aa_2stage, csytrf_aa_2stage, | ||||
| csytrs_aa_2stage, | csytrs_aa_2stage, | ||||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, | |||||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett | |||||
| ); | ); | ||||
| @lapack_embeded_underscore_objs_d=( | @lapack_embeded_underscore_objs_d=( | ||||
| dlasyf_rook, | dlasyf_rook, | ||||
| @@ -3615,7 +3740,7 @@ zpotri, | |||||
| dsbevd_2stage, dsygv_2stage, | dsbevd_2stage, dsygv_2stage, | ||||
| dsysv_aa_2stage, | dsysv_aa_2stage, | ||||
| dsytrf_aa_2stage, dsytrs_aa_2stage, | dsytrf_aa_2stage, dsytrs_aa_2stage, | ||||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, | |||||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett | |||||
| ); | ); | ||||
| @lapack_embeded_underscore_objs_z=( | @lapack_embeded_underscore_objs_z=( | ||||
| zhetf2_rook, zhetrf_rook, zhetri_rook, | zhetf2_rook, zhetrf_rook, zhetri_rook, | ||||
| @@ -3639,7 +3764,7 @@ zpotri, | |||||
| zhesv_aa_2stage, zhetrf_aa_2stage, | zhesv_aa_2stage, zhetrf_aa_2stage, | ||||
| zhetrs_aa_2stage, zsysv_aa_2stage, | zhetrs_aa_2stage, zsysv_aa_2stage, | ||||
| zsytrf_aa_2stage, zsytrs_aa_2stage, | zsytrf_aa_2stage, zsytrs_aa_2stage, | ||||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col | |||||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett | |||||
| ); | ); | ||||
| @@ -245,6 +245,13 @@ else | |||||
| ;; | ;; | ||||
| *flang*) | *flang*) | ||||
| vendor=FLANG | vendor=FLANG | ||||
| data=`$compiler -v 2>&1 > /dev/null` | |||||
| v="${data#*version *}" | |||||
| v="${v%%*.}" | |||||
| major="${v%%.*}" | |||||
| if [ "$major" -ge 17 ]; then | |||||
| vendor=FLANGNEW | |||||
| fi | |||||
| bu=_ | bu=_ | ||||
| openmp='-fopenmp' | openmp='-fopenmp' | ||||
| ;; | ;; | ||||
| @@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "ARMV8SVE" | #define CORENAME "ARMV8SVE" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_ARMV9SME | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "ARMV9SME" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DARMV9SME " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" | |||||
| #define LIBNAME "armv9sme" | |||||
| #define CORENAME "ARMV9SME" | |||||
| #endif | |||||
| #ifdef FORCE_ARMV8 | #ifdef FORCE_ARMV8 | ||||
| #define FORCE | #define FORCE | ||||
| @@ -30,17 +30,17 @@ set(BLAS2_SOURCES | |||||
| gemv.c ger.c | gemv.c ger.c | ||||
| trsv.c trmv.c | trsv.c trmv.c | ||||
| syr2.c gbmv.c | syr2.c gbmv.c | ||||
| sbmv.c | |||||
| sbmv.c spmv.c | |||||
| spr2.c | spr2.c | ||||
| tbsv.c tbmv.c | tbsv.c tbmv.c | ||||
| tpsv.c tpmv.c | tpsv.c tpmv.c | ||||
| ) | ) | ||||
| set(BLAS2_REAL_ONLY_SOURCES | set(BLAS2_REAL_ONLY_SOURCES | ||||
| symv.c syr.c spmv.c spr.c | |||||
| symv.c syr.c spr.c | |||||
| ) | ) | ||||
| set(BLAS2_COMPLEX_LAPACK_SOURCES | set(BLAS2_COMPLEX_LAPACK_SOURCES | ||||
| symv.c syr.c spmv.c spr.c | |||||
| symv.c syr.c spr.c | |||||
| ) | ) | ||||
| set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | ||||
| @@ -109,7 +109,7 @@ endif () | |||||
| GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) | GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) | ||||
| # gemmtr is gemmt under the name adopted by the Reference BLAS | # gemmtr is gemmt under the name adopted by the Reference BLAS | ||||
| GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG}) | |||||
| GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG}) | |||||
| # max and imax are compiled 4 times | # max and imax are compiled 4 times | ||||
| GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) | GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) | ||||
| @@ -125,8 +125,8 @@ endif () | |||||
| if (BUILD_BFLOAT16) | if (BUILD_BFLOAT16) | ||||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| @@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) | |||||
| endforeach () | endforeach () | ||||
| endif() | endif() | ||||
| if (NOT DEFINED NO_LAPACK) | |||||
| if (NOT NO_LAPACK) | |||||
| set(LAPACK_SOURCES | set(LAPACK_SOURCES | ||||
| lapack/gesv.c | lapack/gesv.c | ||||
| ) | ) | ||||
| @@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||||
| endif () | endif () | ||||
| add_library(interface OBJECT ${OPENBLAS_SRC}) | add_library(interface OBJECT ${OPENBLAS_SRC}) | ||||
| if (USE_OPENMP) | |||||
| target_link_libraries(interface OpenMP::OpenMP_C) | |||||
| endif() | |||||
| @@ -1304,9 +1304,9 @@ ifeq ($(BUILD_BFLOAT16),1) | |||||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h | sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| endif | endif | ||||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | ||||
| @@ -1328,34 +1328,34 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||||
| sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h | sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h | dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h | qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h | cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h | zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||||
| ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| /* Copyright 2024 The OpenBLAS Project */ | |||||
| /* Copyright 2024, 2025 The OpenBLAS Project */ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
| /* All rights reserved. */ | /* All rights reserved. */ | ||||
| /* */ | /* */ | ||||
| @@ -177,6 +177,74 @@ static int init_amxtile_permission() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #ifdef DYNAMIC_ARCH | |||||
| extern char* gotoblas_corename(void); | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||||
| static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { | |||||
| return | |||||
| MNK < 262144L ? 1 | |||||
| : MNK < 1124864L ? MIN(ncpu, 6) | |||||
| : MNK < 7880599L ? MIN(ncpu, 12) | |||||
| : MNK < 17173512L ? MIN(ncpu, 16) | |||||
| : MNK < 33386248L ? MIN(ncpu, 20) | |||||
| : MNK < 57066625L ? MIN(ncpu, 24) | |||||
| : MNK < 91733851L ? MIN(ncpu, 32) | |||||
| : MNK < 265847707L ? MIN(ncpu, 40) | |||||
| : MNK < 458314011L ? MIN(ncpu, 48) | |||||
| : MNK < 729000000L ? MIN(ncpu, 56) | |||||
| : ncpu; | |||||
| } | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||||
| static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { | |||||
| return | |||||
| MNK < 125000L ? 1 | |||||
| : MNK < 1092727L ? MIN(ncpu, 6) | |||||
| : MNK < 2628072L ? MIN(ncpu, 8) | |||||
| : MNK < 8000000L ? MIN(ncpu, 12) | |||||
| : MNK < 20346417L ? MIN(ncpu, 16) | |||||
| : MNK < 57066625L ? MIN(ncpu, 24) | |||||
| : MNK < 91125000L ? MIN(ncpu, 28) | |||||
| : MNK < 238328000L ? MIN(ncpu, 40) | |||||
| : MNK < 454756609L ? MIN(ncpu, 48) | |||||
| : MNK < 857375000L ? MIN(ncpu, 56) | |||||
| : MNK < 1073741824L ? MIN(ncpu, 64) | |||||
| : ncpu; | |||||
| } | |||||
| #endif | |||||
| static inline int get_gemm_optimal_nthreads(double MNK) { | |||||
| int ncpu = num_cpu_avail(3); | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||||
| } | |||||
| if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||||
| } | |||||
| #endif | |||||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { | |||||
| return 1; | |||||
| } | |||||
| else { | |||||
| if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { | |||||
| return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||||
| } | |||||
| else { | |||||
| return ncpu; | |||||
| } | |||||
| } | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANSA, char *TRANSB, | void NAME(char *TRANSA, char *TRANSB, | ||||
| @@ -310,7 +378,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| FLOAT *beta = (FLOAT*) vbeta; | FLOAT *beta = (FLOAT*) vbeta; | ||||
| FLOAT *a = (FLOAT*) va; | FLOAT *a = (FLOAT*) va; | ||||
| FLOAT *b = (FLOAT*) vb; | FLOAT *b = (FLOAT*) vb; | ||||
| FLOAT *c = (FLOAT*) vc; | |||||
| FLOAT *c = (FLOAT*) vc; | |||||
| #endif | #endif | ||||
| blas_arg_t args; | blas_arg_t args; | ||||
| @@ -349,15 +417,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||||
| #ifdef DYNAMIC_ARCH | |||||
| if (support_avx512() ) | |||||
| #endif | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (support_avx512() ) | |||||
| #endif | |||||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | ||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | ||||
| return; | return; | ||||
| } | } | ||||
| #endif | |||||
| #if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (support_sme1()) | |||||
| #endif | |||||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #endif | #endif | ||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| @@ -604,13 +682,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #endif | #endif | ||||
| MNK = (double) args.m * (double) args.n * (double) args.k; | MNK = (double) args.m * (double) args.n * (double) args.k; | ||||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| args.nthreads = 1; | |||||
| else { | |||||
| args.nthreads = num_cpu_avail(3); | |||||
| if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) | |||||
| args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||||
| } | |||||
| args.nthreads = get_gemm_optimal_nthreads(MNK); | |||||
| args.common = NULL; | args.common = NULL; | ||||
| @@ -38,6 +38,17 @@ | |||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| #define SMP_THRESHOLD_MIN 65536.0 | #define SMP_THRESHOLD_MIN 65536.0 | ||||
| #ifdef RNAME | |||||
| #ifdef XDOUBLE | |||||
| #define ERROR_NAME "QGEMMTR" | |||||
| #elif defined(DOUBLE) | |||||
| #define ERROR_NAME "DGEMMTR" | |||||
| #elif defined(BFLOAT16) | |||||
| #define ERROR_NAME "SBGEMMTR" | |||||
| #else | |||||
| #define ERROR_NAME "SGEMMTR" | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| #define ERROR_NAME "QGEMMT " | #define ERROR_NAME "QGEMMT " | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -47,8 +58,18 @@ | |||||
| #else | #else | ||||
| #define ERROR_NAME "SGEMMT " | #define ERROR_NAME "SGEMMT " | ||||
| #endif | #endif | ||||
| #endif | |||||
| #else | #else | ||||
| #define SMP_THRESHOLD_MIN 8192.0 | #define SMP_THRESHOLD_MIN 8192.0 | ||||
| #ifdef RNAME | |||||
| #ifdef XDOUBLE | |||||
| #define ERROR_NAME "XGEMMTR" | |||||
| #elif defined(DOUBLE) | |||||
| #define ERROR_NAME "ZGEMMTR" | |||||
| #else | |||||
| #define ERROR_NAME "CGEMMTR" | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| #define ERROR_NAME "XGEMMT " | #define ERROR_NAME "XGEMMT " | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -57,6 +78,7 @@ | |||||
| #define ERROR_NAME "CGEMMT " | #define ERROR_NAME "CGEMMT " | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | #ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | #define GEMM_MULTITHREAD_THRESHOLD 4 | ||||
| @@ -666,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| IDEBUG_END; | IDEBUG_END; | ||||
| /* transform B back if necessary */ | |||||
| #if defined(COMPLEX) | |||||
| if (transb > 1){ | |||||
| #ifndef CBLAS | |||||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| #else | |||||
| if (order == CblasColMajor) | |||||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| if (order == CblasRowMajor) | |||||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -63,6 +63,70 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||||
| }; | }; | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #ifdef DYNAMIC_ARCH | |||||
| extern char* gotoblas_corename(void); | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||||
| static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||||
| #ifdef DOUBLE | |||||
| return (MN < 8100L) ? 1 | |||||
| : (MN < 12100L) ? MIN(ncpu, 2) | |||||
| : (MN < 36100L) ? MIN(ncpu, 4) | |||||
| : (MN < 84100L) ? MIN(ncpu, 8) | |||||
| : (MN < 348100L) ? MIN(ncpu, 16) | |||||
| : (MN < 435600L) ? MIN(ncpu, 24) | |||||
| : (MN < 810000L) ? MIN(ncpu, 32) | |||||
| : (MN < 1050625L) ? MIN(ncpu, 40) | |||||
| : ncpu; | |||||
| #else | |||||
| return (MN < 25600L) ? 1 | |||||
| : (MN < 63001L) ? MIN(ncpu, 4) | |||||
| : (MN < 459684L) ? MIN(ncpu, 16) | |||||
| : ncpu; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||||
| static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { | |||||
| return | |||||
| MN < 24964L ? 1 | |||||
| : MN < 65536L ? MIN(ncpu, 8) | |||||
| : MN < 262144L ? MIN(ncpu, 32) | |||||
| : MN < 1638400L ? MIN(ncpu, 64) | |||||
| : ncpu; | |||||
| } | |||||
| #endif | |||||
| static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||||
| int ncpu = num_cpu_avail(3); | |||||
| #if defined(_WIN64) && defined(_M_ARM64) | |||||
| if (MN > 100000000L) | |||||
| return num_cpu_avail(4); | |||||
| return 1; | |||||
| #endif | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||||
| } | |||||
| if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||||
| return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||||
| } | |||||
| #endif | |||||
| if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| return 1; | |||||
| else | |||||
| return num_cpu_avail(2); | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANS, blasint *M, blasint *N, | void NAME(char *TRANS, blasint *M, blasint *N, | ||||
| @@ -202,13 +266,6 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (alpha == ZERO) return; | if (alpha == ZERO) return; | ||||
| #if 0 | |||||
| /* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ | |||||
| if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | |||||
| GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| IDEBUG_START; | IDEBUG_START; | ||||
| FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
| @@ -225,11 +282,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | STACK_ALLOC(buffer_size, FLOAT, buffer); | ||||
| #ifdef SMP | #ifdef SMP | ||||
| if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(2); | |||||
| nthreads = get_gemv_optimal_nthreads(1L * m * n); | |||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| #endif | #endif | ||||
| @@ -107,21 +107,35 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, | |||||
| #ifndef PPC440 | #ifndef PPC440 | ||||
| buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
| sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | ||||
| sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| #endif | #endif | ||||
| #ifdef SMP | #ifdef SMP | ||||
| args.common = NULL; | args.common = NULL; | ||||
| #ifndef DOUBLE | |||||
| if (args.m*args.n < 40000) | |||||
| #if defined(_WIN64) && defined(_M_ARM64) | |||||
| #ifdef COMPLEX | |||||
| if (args.m * args.n <= 300) | |||||
| #else | |||||
| if (args.m * args.n <= 500) | |||||
| #endif | |||||
| args.nthreads = 1; | |||||
| else if (args.m * args.n <= 1000) | |||||
| args.nthreads = 4; | |||||
| else | |||||
| args.nthreads = num_cpu_avail(4); | |||||
| #else | #else | ||||
| if (args.m*args.n < 10000) | |||||
| #ifndef DOUBLE | |||||
| if (args.m * args.n < 40000) | |||||
| #else | |||||
| if (args.m * args.n < 10000) | |||||
| #endif | |||||
| args.nthreads = 1; | |||||
| else | |||||
| args.nthreads = num_cpu_avail(4); | |||||
| #endif | #endif | ||||
| args.nthreads=1; | |||||
| else | |||||
| args.nthreads = num_cpu_avail(4); | |||||
| if (args.nthreads == 1) { | if (args.nthreads == 1) { | ||||
| #endif | #endif | ||||
| @@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
| #else | #else | ||||
| return fabsf(x[0]); | return fabsf(x[0]); | ||||
| #endif | #endif | ||||
| #endif | |||||
| if (incx == 0) | |||||
| #ifndef COMPLEX | |||||
| #ifdef DOUBLE | |||||
| return (sqrt((double)n)*fabs(x[0])); | |||||
| #else | |||||
| return (sqrt((float)n)*fabsf(x[0])); | |||||
| #endif | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| { | |||||
| double fr=fabs(x[0]); | |||||
| double fi=fabs(x[1]); | |||||
| double fmin=MIN(fr,fi); | |||||
| double fmax=MAX(fr,fi); | |||||
| if (fmax==0.) return(fmax); | |||||
| if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||||
| return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||||
| } | |||||
| #else | |||||
| { | |||||
| float fr=fabs(x[0]); | |||||
| float fi=fabs(x[1]); | |||||
| float fmin=MIN(fr,fi); | |||||
| float fmax=MAX(fr,fi); | |||||
| if (fmax==0.) return(fmax); | |||||
| if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||||
| return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||||
| } | |||||
| #endif | |||||
| #endif | #endif | ||||
| if (incx < 0) | if (incx < 0) | ||||
| @@ -97,13 +128,44 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||||
| if (n <= 0) return 0.; | if (n <= 0) return 0.; | ||||
| #ifndef COMPLEX | |||||
| #ifndef COMPLEX | |||||
| if (n == 1) | if (n == 1) | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| return fabs(x[0]); | return fabs(x[0]); | ||||
| #else | #else | ||||
| return fabsf(x[0]); | return fabsf(x[0]); | ||||
| #endif | #endif | ||||
| #endif | |||||
| if (incx == 0) | |||||
| #ifndef COMPLEX | |||||
| #ifdef DOUBLE | |||||
| return (sqrt((double)n)*fabs(x[0])); | |||||
| #else | |||||
| return (sqrt((float)n)*fabsf(x[0])); | |||||
| #endif | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| { | |||||
| double fr=fabs(x[0]); | |||||
| double fi=fabs(x[1]); | |||||
| double fmin=MIN(fr,fi); | |||||
| double fmax=MAX(fr,fi); | |||||
| if (fmax==0.) return(fmax); | |||||
| if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||||
| return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||||
| } | |||||
| #else | |||||
| { | |||||
| float fr=fabs(x[0]); | |||||
| float fi=fabs(x[1]); | |||||
| float fmin=MIN(fr,fi); | |||||
| float fmax=MAX(fr,fi); | |||||
| if (fmax==0.) return(fmax); | |||||
| if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||||
| return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||||
| } | |||||
| #endif | |||||
| #endif | #endif | ||||
| if (incx < 0) | if (incx < 0) | ||||
| @@ -7,149 +7,21 @@ | |||||
| void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | ||||
| blasint n = *N; | |||||
| blasint incx = *INCX; | |||||
| blasint incy = *INCY; | |||||
| blasint n = *N; | |||||
| blasint incx = *INCX; | |||||
| blasint incy = *INCY; | |||||
| PRINT_DEBUG_NAME | |||||
| #else | #else | ||||
| void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | ||||
| #endif | |||||
| blasint i__1, i__2; | |||||
| PRINT_DEBUG_CNAME; | |||||
| blasint i__; | |||||
| FLOAT w, z__; | |||||
| blasint kx, ky; | |||||
| FLOAT dh11, dh12, dh22, dh21, dflag; | |||||
| blasint nsteps; | |||||
| #ifndef CBLAS | |||||
| PRINT_DEBUG_CNAME; | |||||
| #else | |||||
| PRINT_DEBUG_CNAME; | |||||
| #endif | #endif | ||||
| --dparam; | |||||
| --dy; | |||||
| --dx; | |||||
| dflag = dparam[1]; | |||||
| if (n <= 0 || dflag == - 2.0) goto L140; | |||||
| if (! (incx == incy && incx > 0)) goto L70; | |||||
| nsteps = n * incx; | |||||
| if (dflag < 0.) { | |||||
| goto L50; | |||||
| } else if (dflag == 0) { | |||||
| goto L10; | |||||
| } else { | |||||
| goto L30; | |||||
| } | |||||
| L10: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__; | |||||
| /* L20: */ | |||||
| } | |||||
| goto L140; | |||||
| L30: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = nsteps; | |||||
| i__1 = incx; | |||||
| for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__; | |||||
| dy[i__] = -w + dh22 * z__; | |||||
| /* L40: */ | |||||
| } | |||||
| goto L140; | |||||
| L50: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__ * dh22; | |||||
| /* L60: */ | |||||
| } | |||||
| goto L140; | |||||
| L70: | |||||
| kx = 1; | |||||
| ky = 1; | |||||
| if (incx < 0) { | |||||
| kx = (1 - n) * incx + 1; | |||||
| } | |||||
| if (incy < 0) { | |||||
| ky = (1 - n) * incy + 1; | |||||
| } | |||||
| ROTM_K(n, dx, incx, dy, incy, dparam); | |||||
| if (dflag < 0.) { | |||||
| goto L120; | |||||
| } else if (dflag == 0) { | |||||
| goto L80; | |||||
| } else { | |||||
| goto L100; | |||||
| } | |||||
| L80: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L90: */ | |||||
| } | |||||
| goto L140; | |||||
| L100: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__; | |||||
| dy[ky] = -w + dh22 * z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L110: */ | |||||
| } | |||||
| goto L140; | |||||
| L120: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__ * dh22; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L130: */ | |||||
| } | |||||
| L140: | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| #ifdef SMP | #ifdef SMP | ||||
| if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| #if defined(_WIN64) && defined(_M_ARM64) | |||||
| if (m*n > 25000000L) | |||||
| nthreads = num_cpu_avail(4); | |||||
| else | |||||
| nthreads = 1; | |||||
| #else | |||||
| if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) | |||||
| nthreads = 1; | nthreads = 1; | ||||
| else | else | ||||
| nthreads = num_cpu_avail(2); | nthreads = num_cpu_avail(2); | ||||
| #endif | |||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| #endif | |||||
| #endif | |||||
| (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | ||||
| #ifdef SMP | #ifdef SMP | ||||
| } else { | } else { | ||||
| (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | ||||
| } | } | ||||
| #endif | #endif | ||||
| STACK_FREE(buffer); | STACK_FREE(buffer); | ||||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | ||||
| @@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| #endif | #endif | ||||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); | |||||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1); | |||||
| #ifdef SMP | #ifdef SMP | ||||
| } else { | } else { | ||||
| @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | mode = BLAS_SINGLE | BLAS_COMPLEX; | ||||
| #endif | #endif | ||||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||||
| #else | #else | ||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int uplo; | int uplo; | ||||
| blasint info; | blasint info; | ||||
| FLOAT * ALPHA = &alpha; | |||||
| FLOAT * ALPHA = (FLOAT*)valpha; | |||||
| FLOAT alpha_r = ALPHA[0]; | FLOAT alpha_r = ALPHA[0]; | ||||
| FLOAT alpha_i = ALPHA[1]; | FLOAT alpha_i = ALPHA[1]; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | ||||
| @@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") | |||||
| endif () | endif () | ||||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | ||||
| @@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | ||||
| @@ -198,25 +201,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| # Makefile.L3 | # Makefile.L3 | ||||
| set(USE_TRMM false) | set(USE_TRMM false) | ||||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | ||||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||||
| if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||||
| set(USE_TRMM true) | set(USE_TRMM true) | ||||
| endif () | endif () | ||||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | ||||
| set(USE_TRMM true) | set(USE_TRMM true) | ||||
| endif () | endif () | ||||
| set(USE_DIRECT_SGEMM false) | set(USE_DIRECT_SGEMM false) | ||||
| if (X86_64) | |||||
| if (X86_64 OR ARM64) | |||||
| set(USE_DIRECT_SGEMM true) | set(USE_DIRECT_SGEMM true) | ||||
| endif() | endif() | ||||
| if (USE_DIRECT_SGEMM) | if (USE_DIRECT_SGEMM) | ||||
| # if (NOT DEFINED SGEMMDIRECTKERNEL) | # if (NOT DEFINED SGEMMDIRECTKERNEL) | ||||
| if (X86_64) | |||||
| set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | ||||
| set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | ||||
| # endif() | # endif() | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | ||||
| elseif (ARM64) | |||||
| set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) | |||||
| set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | |||||
| set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||||
| if (HAVE_SME) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | |||||
| endif () | |||||
| endif () | |||||
| endif() | endif() | ||||
| foreach (float_type SINGLE DOUBLE) | foreach (float_type SINGLE DOUBLE) | ||||
| @@ -1105,6 +1118,7 @@ endif () | |||||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | ||||
| @@ -1352,6 +1366,9 @@ endif () | |||||
| if (USE_GEMM3M) | if (USE_GEMM3M) | ||||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | ||||
| endif() | endif() | ||||
| if (USE_OPENMP) | |||||
| target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) | |||||
| endif() | |||||
| endfunction () | endfunction () | ||||
| @@ -24,7 +24,11 @@ ifdef NO_AVX2 | |||||
| AVX2OPT= | AVX2OPT= | ||||
| endif | endif | ||||
| ifdef TARGET_CORE | ifdef TARGET_CORE | ||||
| ifeq ($(TARGET_CORE), ARMV9SME) | |||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme | |||||
| endif | |||||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | ||||
| @@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||||
| XROTKERNEL = zrot.S | XROTKERNEL = zrot.S | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = rotm.S | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = rotm.S | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = rotm.S | |||||
| endif | |||||
| ### SCAL ### | ### SCAL ### | ||||
| ifndef SSCALKERNEL | ifndef SSCALKERNEL | ||||
| @@ -504,21 +516,21 @@ SBLASOBJS += \ | |||||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| saxpby_k$(TSUFFIX).$(SUFFIX) | |||||
| saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||||
| DBLASOBJS += \ | DBLASOBJS += \ | ||||
| damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||||
| QBLASOBJS += \ | QBLASOBJS += \ | ||||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||||
| qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||||
| CBLASOBJS += \ | CBLASOBJS += \ | ||||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| @@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | ||||
| $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | ||||
| @@ -24,6 +24,7 @@ endif | |||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| USE_DIRECT_SGEMM = 1 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), riscv64) | ifeq ($(ARCH), riscv64) | ||||
| @@ -95,9 +96,17 @@ endif | |||||
| ifdef USE_DIRECT_SGEMM | ifdef USE_DIRECT_SGEMM | ||||
| ifndef SGEMMDIRECTKERNEL | ifndef SGEMMDIRECTKERNEL | ||||
| ifeq ($(ARCH), x86_64) | |||||
| SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | ||||
| SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | ||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | |||||
| ifeq ($(TARGET_CORE), ARMV9SME) | |||||
| HAVE_SME = 1 | |||||
| endif | |||||
| SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | ifeq ($(BUILD_BFLOAT16), 1) | ||||
| @@ -128,9 +137,20 @@ SKERNELOBJS += \ | |||||
| $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | ||||
| ifdef USE_DIRECT_SGEMM | ifdef USE_DIRECT_SGEMM | ||||
| ifeq ($(ARCH), x86_64) | |||||
| SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
| sgemm_direct$(TSUFFIX).$(SUFFIX) \ | sgemm_direct$(TSUFFIX).$(SUFFIX) \ | ||||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ifeq ($(ARCH), arm64) | |||||
| SKERNELOBJS += \ | |||||
| sgemm_direct$(TSUFFIX).$(SUFFIX) | |||||
| ifdef HAVE_SME | |||||
| SKERNELOBJS += \ | |||||
| sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -809,11 +829,23 @@ else | |||||
| endif | endif | ||||
| ifdef USE_DIRECT_SGEMM | ifdef USE_DIRECT_SGEMM | ||||
| ifeq ($(ARCH), x86_64) | |||||
| $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | ||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | ||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | |||||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| ifdef HAVE_SME | |||||
| $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | |||||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | |||||
| $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | |||||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| ifeq ($(BUILD_BFLOAT16), 1) | ifeq ($(BUILD_BFLOAT16), 1) | ||||
| @@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | ||||
| ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | ||||
| ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /************************************************************************************** | /************************************************************************************** | ||||
| * 2013/09/14 Saar | * 2013/09/14 Saar | ||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * BLASTEST float : OK | |||||
| * BLASTEST double : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | * | ||||
| **************************************************************************************/ | **************************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| // The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. | |||||
| // In certain cases, the expected return values for cblas_c/zscal differ from those of other upper-level interfaces. | |||||
| // To handle this, we use the dummy2 parameter to differentiate between them. | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| { | { | ||||
| BLASLONG i=0; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT temp; | |||||
| BLASLONG i = 0; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT temp; | |||||
| if ( (n <= 0) || (inc_x <= 0)) | |||||
| return(0); | |||||
| if ((n <= 0) || (inc_x <= 0)) | |||||
| return(0); | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (dummy2 == 0) { | |||||
| for (i = 0; i < n; i++) | |||||
| { | |||||
| if (da_r == 0.0 && da_i == 0.0) | |||||
| { | |||||
| x[ip] = 0.0; | |||||
| x[ip+1] = 0.0; | |||||
| } | |||||
| else | |||||
| { | |||||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||||
| x[ip] = temp; | |||||
| } | |||||
| inc_x2 = 2 * inc_x; | |||||
| for ( i=0; i<n; i++ ) | |||||
| { | |||||
| if ( da_r == 0.0 ) | |||||
| { | |||||
| if ( da_i == 0.0 ) | |||||
| { | |||||
| temp = 0.0; | |||||
| x[ip+1] = 0.0 ; | |||||
| } | |||||
| else | |||||
| { | |||||
| temp = - da_i * x[ip+1] ; | |||||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||||
| if (!isinf(x[ip+1])) | |||||
| x[ip+1] = da_i * x[ip] ; | |||||
| else x[ip+1] = NAN; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( da_i == 0.0 ) | |||||
| { | |||||
| temp = da_r * x[ip] ; | |||||
| x[ip+1] = da_r * x[ip+1]; | |||||
| } | |||||
| else | |||||
| { | |||||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||||
| } | |||||
| } | |||||
| x[ip] = temp; | |||||
| ip += inc_x2; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| for (i = 0; i < n; i++) | |||||
| { | |||||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||||
| ip += inc_x2; | |||||
| } | |||||
| return(0); | |||||
| x[ip] = temp; | |||||
| ip += inc_x2; | |||||
| } | |||||
| return(0); | |||||
| } | } | ||||
| @@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -1,6 +1,6 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | include $(KERNELDIR)/KERNEL.ARMV8SVE | ||||
| SGEMVNKERNEL = gemv_n_sve.c | |||||
| DGEMVNKERNEL = gemv_n_sve.c | |||||
| SGEMVNKERNEL = gemv_n_sve_v4x3.c | |||||
| DGEMVNKERNEL = gemv_n_sve_v4x3.c | |||||
| SGEMVTKERNEL = gemv_t_sve_v4x3.c | SGEMVTKERNEL = gemv_t_sve_v4x3.c | ||||
| DGEMVTKERNEL = gemv_t_sve_v4x3.c | DGEMVTKERNEL = gemv_t_sve_v4x3.c | ||||
| @@ -74,16 +74,21 @@ DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | CSCALKERNEL = zscal.S | ||||
| ZSCALKERNEL = zscal.S | ZSCALKERNEL = zscal.S | ||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| CGEMVNKERNEL = zgemv_n.S | CGEMVNKERNEL = zgemv_n.S | ||||
| ZGEMVNKERNEL = zgemv_n.S | ZGEMVNKERNEL = zgemv_n.S | ||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||||
| CGEMVTKERNEL = zgemv_t.S | CGEMVTKERNEL = zgemv_t.S | ||||
| ZGEMVTKERNEL = zgemv_t.S | ZGEMVTKERNEL = zgemv_t.S | ||||
| SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||||
| SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||||
| DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||||
| DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||||
| SASUMKERNEL = sasum_thunderx2t99.c | SASUMKERNEL = sasum_thunderx2t99.c | ||||
| DASUMKERNEL = dasum_thunderx2t99.c | DASUMKERNEL = dasum_thunderx2t99.c | ||||
| CASUMKERNEL = casum_thunderx2t99.c | CASUMKERNEL = casum_thunderx2t99.c | ||||
| @@ -0,0 +1,3 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||||
| @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | CSCALKERNEL = zscal.S | ||||
| ZSCALKERNEL = zscal.S | ZSCALKERNEL = zscal.S | ||||
| SGEMVNKERNEL = gemv_n.S | |||||
| SGEMVNKERNEL = sgemv_n_neon.c | |||||
| DGEMVNKERNEL = gemv_n.S | DGEMVNKERNEL = gemv_n.S | ||||
| CGEMVNKERNEL = zgemv_n.S | CGEMVNKERNEL = zgemv_n.S | ||||
| ZGEMVNKERNEL = zgemv_n.S | ZGEMVNKERNEL = zgemv_n.S | ||||
| @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | CGEMVTKERNEL = zgemv_t.S | ||||
| ZGEMVTKERNEL = zgemv_t.S | ZGEMVTKERNEL = zgemv_t.S | ||||
| SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||||
| SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||||
| DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||||
| DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||||
| SASUMKERNEL = sasum_thunderx2t99.c | SASUMKERNEL = sasum_thunderx2t99.c | ||||
| DASUMKERNEL = dasum_thunderx2t99.c | DASUMKERNEL = dasum_thunderx2t99.c | ||||
| @@ -98,8 +102,18 @@ ZNRM2KERNEL = znrm2.S | |||||
| DDOTKERNEL = dot.c | DDOTKERNEL = dot.c | ||||
| SDOTKERNEL = dot.c | SDOTKERNEL = dot.c | ||||
| ifeq ($(OSNAME), WINNT) | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| else | |||||
| CDOTKERNEL = zdot_thunderx2t99.c | |||||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||||
| endif | |||||
| else | |||||
| CDOTKERNEL = zdot_thunderx2t99.c | CDOTKERNEL = zdot_thunderx2t99.c | ||||
| ZDOTKERNEL = zdot_thunderx2t99.c | ZDOTKERNEL = zdot_thunderx2t99.c | ||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| DGEMM_BETA = dgemm_beta.S | DGEMM_BETA = dgemm_beta.S | ||||
| @@ -60,13 +60,13 @@ DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | CSCALKERNEL = zscal.S | ||||
| ZSCALKERNEL = zscal.S | ZSCALKERNEL = zscal.S | ||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| CGEMVNKERNEL = zgemv_n.S | CGEMVNKERNEL = zgemv_n.S | ||||
| ZGEMVNKERNEL = zgemv_n.S | ZGEMVNKERNEL = zgemv_n.S | ||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||||
| CGEMVTKERNEL = zgemv_t.S | CGEMVTKERNEL = zgemv_t.S | ||||
| ZGEMVTKERNEL = zgemv_t.S | ZGEMVTKERNEL = zgemv_t.S | ||||
| @@ -198,3 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||||
| @@ -1,4 +1,24 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | include $(KERNELDIR)/KERNEL.ARMV8SVE | ||||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | SGEMVTKERNEL = gemv_t_sve_v1x3.c | ||||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | DGEMVTKERNEL = gemv_t_sve_v1x3.c | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| SBGEMM_BETA = sbgemm_beta_neoversev1.c | |||||
| SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c | |||||
| ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||||
| SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||||
| SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||||
| SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||||
| endif | |||||
| @@ -1 +1,6 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | include $(KERNELDIR)/KERNEL.ARMV8SVE | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||||
| endif | |||||
| @@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||||
| #Dump kernel | #Dump kernel | ||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -1,216 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <arm_neon.h> | |||||
| #define N "x0" /* vector length */ | |||||
| #define X "x1" /* X vector address */ | |||||
| #define INC_X "x2" /* X stride */ | |||||
| #define Y "x3" /* Y vector address */ | |||||
| #define INC_Y "x4" /* Y stride */ | |||||
| #define J "x5" /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| #define TMPF "s0" | |||||
| #define INC_SHIFT "2" | |||||
| #define N_DIV_SHIFT "2" | |||||
| #define N_REM_MASK "3" | |||||
| #else | |||||
| #define TMPF "d0" | |||||
| #define INC_SHIFT "3" | |||||
| #define N_DIV_SHIFT "1" | |||||
| #define N_REM_MASK "1" | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| #define TMPF "d0" | |||||
| #define INC_SHIFT "3" | |||||
| #define N_DIV_SHIFT "1" | |||||
| #define N_REM_MASK "1" | |||||
| #else | |||||
| #define TMPF "q0" | |||||
| #define INC_SHIFT "4" | |||||
| #define N_DIV_SHIFT "0" | |||||
| #define N_REM_MASK "0" | |||||
| #endif | |||||
| #endif | |||||
| #define KERNEL_F1 \ | |||||
| "ldr "TMPF", ["X"] \n" \ | |||||
| "add "X", "X", "INC_X" \n" \ | |||||
| "str "TMPF", ["Y"] \n" \ | |||||
| "add "Y", "Y", "INC_Y" \n" | |||||
| #define KERNEL_F \ | |||||
| "ldr q0, ["X"], #16 \n" \ | |||||
| "str q0, ["Y"], #16 \n" | |||||
| #define INIT \ | |||||
| "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||||
| "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||||
/*
 * do_copy: copy n elements from x to y using AArch64 inline assembly.
 *
 *   n      element count; nothing is copied when n <= 0 (rejected in C for
 *          n < 0 and again in the assembly for N <= 0).
 *   x      source pointer, advanced by inc_x elements after each element.
 *   y      destination pointer, advanced by inc_y elements after each element.
 *   inc_x, inc_y
 *          strides in elements; INIT shifts both left by INC_SHIFT before
 *          they are added to the pointers.
 *
 * Always returns 0.
 *
 * Control flow of the assembly (numeric local labels):
 *   - inc_x == 1 and inc_y == 1: loop 1 runs N >> N_DIV_SHIFT times and moves
 *     16 bytes per iteration through q0 (KERNEL_F); loop 3 then copies the
 *     remaining N & N_REM_MASK elements one at a time (KERNEL_F1). For
 *     COMPLEX && DOUBLE there is no remainder loop: the code branches
 *     straight to the exit label.
 *   - any other stride: loop 5 copies four elements per iteration with
 *     KERNEL_F1, loop 7 copies the remaining N & 3 elements.
 *   - label 8 is the common exit.
 *
 * NOTE(review): the clobber list names "d0" while KERNEL_F and the
 * COMPLEX && DOUBLE variant of TMPF use q0 -- confirm the compiler treats
 * both names as the same vector register.
 */
| static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| if ( n < 0 ) return 0; | |||||
| __asm__ __volatile__ ( | |||||
/* Move the C operands into the fixed registers named by the N/X/INC_X/Y/INC_Y macros. */
| " mov "N", %[N_] \n" | |||||
| " mov "X", %[X_] \n" | |||||
| " mov "INC_X", %[INCX_] \n" | |||||
| " mov "Y", %[Y_] \n" | |||||
| " mov "INC_Y", %[INCY_] \n" | |||||
| " cmp "N", xzr \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
/* Anything other than unit stride on both vectors takes the strided path at label 4. */
| " cmp "INC_X", #1 \n" | |||||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||||
| " cmp "INC_Y", #1 \n" | |||||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||||
| "// .Lcopy_kernel_F_BEGIN: \n" | |||||
| " "INIT" \n" | |||||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||||
| " cmp "J", xzr \n" | |||||
| " beq 2f //copy_kernel_F1 \n" | |||||
| " .align 5 \n" | |||||
| "1: //copy_kernel_F: \n" | |||||
| " "KERNEL_F" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 1b //copy_kernel_F \n" | |||||
| "2: //copy_kernel_F1: \n" | |||||
/* Remainder of the unit-stride path; skipped entirely for COMPLEX && DOUBLE. */
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| " b 8f //copy_kernel_L999 \n" | |||||
| #else | |||||
| " ands "J", "N", #"N_REM_MASK" \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
| #endif | |||||
| "3: //copy_kernel_F10: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 3b //copy_kernel_F10 \n" | |||||
| " b 8f //copy_kernel_L999 \n" | |||||
/* Strided path: four single-element copies per iteration, then N & 3 leftovers. */
| "4: //copy_kernel_S_BEGIN: \n" | |||||
| " "INIT" \n" | |||||
| " asr "J", "N", #2 \n" | |||||
| " cmp "J", xzr \n" | |||||
| " ble 6f //copy_kernel_S1 \n" | |||||
| "5: //copy_kernel_S4: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 5b //copy_kernel_S4 \n" | |||||
| "6: //copy_kernel_S1: \n" | |||||
| " ands "J", "N", #3 \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
| "7: //copy_kernel_S10: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 7b //copy_kernel_S10 \n" | |||||
| "8: //copy_kernel_L999: \n" | |||||
/* No outputs; all five operands are inputs referenced by name in the template. */
| : | |||||
| : [N_] "r" (n), //%1 | |||||
| [X_] "r" (x), //%2 | |||||
| [INCX_] "r" (inc_x), //%3 | |||||
| [Y_] "r" (y), //%4 | |||||
| [INCY_] "r" (inc_y) //%5 | |||||
| : "cc", | |||||
| "memory", | |||||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||||
| "d0" | |||||
| ); | |||||
| return 0; | |||||
| } | |||||
| #if defined(SMP) | |||||
/*
 * Per-thread worker handed to blas_level1_thread by CNAME.
 * Only n, x, inc_x, y and inc_y are used; every dummy* parameter is ignored
 * and the call is forwarded to do_copy. Always returns 0.
 */
| static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||||
| BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||||
| { | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
/*
 * Copy kernel entry point: copies n elements of x (stride inc_x) into y
 * (stride inc_y). Always returns 0; n <= 0 returns without touching memory.
 *
 * Without SMP the work is done by a single do_copy call. With SMP the call
 * stays single-threaded when inc_x == 0 or n <= 10000; otherwise
 * num_cpu_avail(1) decides the thread count and, if that is more than one,
 * the work is handed to blas_level1_thread with copy_thread_function as the
 * worker (presumably splitting the n elements across the threads -- confirm
 * against blas_level1_thread).
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| #if defined(SMP) | |||||
| int nthreads; | |||||
/* NOTE(review): dummy_alpha is never assigned before its address is passed below -- confirm the callee does not depend on its value. */
| FLOAT dummy_alpha; | |||||
| #endif | |||||
| if (n <= 0) return 0; | |||||
| #if defined(SMP) | |||||
/* NOTE(review): only inc_x == 0 forces the single-thread path; inc_y == 0 is not tested -- confirm that threading with a zero destination stride is intended. */
| if (inc_x == 0 || n <= 10000) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(1); | |||||
| if (nthreads == 1) { | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| } else { | |||||
/* mode combines the real/complex flag with the single/double precision flag. */
| int mode = 0; | |||||
| #if !defined(COMPLEX) | |||||
| mode = BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_COMPLEX; | |||||
| #endif | |||||
| #if !defined(DOUBLE) | |||||
| mode |= BLAS_SINGLE; | |||||
| #else | |||||
| mode |= BLAS_DOUBLE; | |||||
| #endif | |||||
| blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||||
| x, inc_x, y, inc_y, NULL, 0, | |||||
| ( void *)copy_thread_function, nthreads); | |||||
| } | |||||
| #else | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <arm_neon.h> | |||||
| #define N "x0" /* vector length */ | |||||
| #define X "x1" /* X vector address */ | |||||
| #define INC_X "x2" /* X stride */ | |||||
| #define Y "x3" /* Y vector address */ | |||||
| #define INC_Y "x4" /* Y stride */ | |||||
| #define J "x5" /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| #define TMPF "s0" | |||||
| #define INC_SHIFT "2" | |||||
| #define N_DIV_SHIFT "2" | |||||
| #define N_REM_MASK "3" | |||||
| #else | |||||
| #define TMPF "d0" | |||||
| #define INC_SHIFT "3" | |||||
| #define N_DIV_SHIFT "1" | |||||
| #define N_REM_MASK "1" | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| #define TMPF "d0" | |||||
| #define INC_SHIFT "3" | |||||
| #define N_DIV_SHIFT "1" | |||||
| #define N_REM_MASK "1" | |||||
| #else | |||||
| #define TMPF "q0" | |||||
| #define INC_SHIFT "4" | |||||
| #define N_DIV_SHIFT "0" | |||||
| #define N_REM_MASK "0" | |||||
| #endif | |||||
| #endif | |||||
| #define KERNEL_F1 \ | |||||
| "ldr "TMPF", ["X"] \n" \ | |||||
| "add "X", "X", "INC_X" \n" \ | |||||
| "str "TMPF", ["Y"] \n" \ | |||||
| "add "Y", "Y", "INC_Y" \n" | |||||
| #define KERNEL_F \ | |||||
| "ldr q0, ["X"], #16 \n" \ | |||||
| "str q0, ["Y"], #16 \n" | |||||
| #define INIT \ | |||||
| "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||||
| "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||||
/*
 * do_copy: copy n elements from x to y using AArch64 inline assembly.
 *
 *   n      element count; nothing is copied when n <= 0 (rejected in C for
 *          n < 0 and again in the assembly for N <= 0).
 *   x      source pointer, advanced by inc_x elements after each element.
 *   y      destination pointer, advanced by inc_y elements after each element.
 *   inc_x, inc_y
 *          strides in elements; INIT shifts both left by INC_SHIFT before
 *          they are added to the pointers.
 *
 * Always returns 0.
 *
 * Control flow of the assembly (numeric local labels):
 *   - inc_x == 1 and inc_y == 1: loop 1 runs N >> N_DIV_SHIFT times and moves
 *     16 bytes per iteration through q0 (KERNEL_F); loop 3 then copies the
 *     remaining N & N_REM_MASK elements one at a time (KERNEL_F1). For
 *     COMPLEX && DOUBLE there is no remainder loop: the code branches
 *     straight to the exit label.
 *   - any other stride: loop 5 copies four elements per iteration with
 *     KERNEL_F1, loop 7 copies the remaining N & 3 elements.
 *   - label 8 is the common exit.
 *
 * The ".align 5" directive in front of loop 1 is left out when building with
 * clang for OS_WINDOWS.
 *
 * NOTE(review): the clobber list names "d0" while KERNEL_F and the
 * COMPLEX && DOUBLE variant of TMPF use q0 -- confirm the compiler treats
 * both names as the same vector register.
 */
| static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| if ( n < 0 ) return 0; | |||||
| __asm__ __volatile__ ( | |||||
/* Move the C operands into the fixed registers named by the N/X/INC_X/Y/INC_Y macros. */
| " mov "N", %[N_] \n" | |||||
| " mov "X", %[X_] \n" | |||||
| " mov "INC_X", %[INCX_] \n" | |||||
| " mov "Y", %[Y_] \n" | |||||
| " mov "INC_Y", %[INCY_] \n" | |||||
| " cmp "N", xzr \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
/* Anything other than unit stride on both vectors takes the strided path at label 4. */
| " cmp "INC_X", #1 \n" | |||||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||||
| " cmp "INC_Y", #1 \n" | |||||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||||
| "// .Lcopy_kernel_F_BEGIN: \n" | |||||
| " "INIT" \n" | |||||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||||
| " cmp "J", xzr \n" | |||||
| " beq 2f //copy_kernel_F1 \n" | |||||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||||
| " .align 5 \n" | |||||
| #endif | |||||
| "1: //copy_kernel_F: \n" | |||||
| " "KERNEL_F" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 1b //copy_kernel_F \n" | |||||
| "2: //copy_kernel_F1: \n" | |||||
/* Remainder of the unit-stride path; skipped entirely for COMPLEX && DOUBLE. */
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| " b 8f //copy_kernel_L999 \n" | |||||
| #else | |||||
| " ands "J", "N", #"N_REM_MASK" \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
| #endif | |||||
| "3: //copy_kernel_F10: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 3b //copy_kernel_F10 \n" | |||||
| " b 8f //copy_kernel_L999 \n" | |||||
/* Strided path: four single-element copies per iteration, then N & 3 leftovers. */
| "4: //copy_kernel_S_BEGIN: \n" | |||||
| " "INIT" \n" | |||||
| " asr "J", "N", #2 \n" | |||||
| " cmp "J", xzr \n" | |||||
| " ble 6f //copy_kernel_S1 \n" | |||||
| "5: //copy_kernel_S4: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 5b //copy_kernel_S4 \n" | |||||
| "6: //copy_kernel_S1: \n" | |||||
| " ands "J", "N", #3 \n" | |||||
| " ble 8f //copy_kernel_L999 \n" | |||||
| "7: //copy_kernel_S10: \n" | |||||
| " "KERNEL_F1" \n" | |||||
| " subs "J", "J", #1 \n" | |||||
| " bne 7b //copy_kernel_S10 \n" | |||||
| "8: //copy_kernel_L999: \n" | |||||
/* No outputs; all five operands are inputs referenced by name in the template. */
| : | |||||
| : [N_] "r" (n), //%1 | |||||
| [X_] "r" (x), //%2 | |||||
| [INCX_] "r" (inc_x), //%3 | |||||
| [Y_] "r" (y), //%4 | |||||
| [INCY_] "r" (inc_y) //%5 | |||||
| : "cc", | |||||
| "memory", | |||||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||||
| "d0" | |||||
| ); | |||||
| return 0; | |||||
| } | |||||
| #if defined(SMP) | |||||
/*
 * Per-thread worker handed to blas_level1_thread by CNAME.
 * Only n, x, inc_x, y and inc_y are used; every dummy* parameter is ignored
 * and the call is forwarded to do_copy. Always returns 0.
 */
| static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||||
| BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||||
| { | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
/*
 * Copy kernel entry point: copies n elements of x (stride inc_x) into y
 * (stride inc_y). Always returns 0; n <= 0 returns without touching memory.
 *
 * Without SMP the work is done by a single do_copy call. With SMP the call
 * stays single-threaded when inc_x == 0 or n <= 10000; otherwise
 * num_cpu_avail(1) decides the thread count and, if that is more than one,
 * the work is handed to blas_level1_thread with copy_thread_function as the
 * worker (presumably splitting the n elements across the threads -- confirm
 * against blas_level1_thread).
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| #if defined(SMP) | |||||
| int nthreads; | |||||
/* NOTE(review): dummy_alpha is never assigned before its address is passed below -- confirm the callee does not depend on its value. */
| FLOAT dummy_alpha; | |||||
| #endif | |||||
| if (n <= 0) return 0; | |||||
| #if defined(SMP) | |||||
/* NOTE(review): only inc_x == 0 forces the single-thread path; inc_y == 0 is not tested -- confirm that threading with a zero destination stride is intended. */
| if (inc_x == 0 || n <= 10000) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(1); | |||||
| if (nthreads == 1) { | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| } else { | |||||
/* mode combines the real/complex flag with the single/double precision flag. */
| int mode = 0; | |||||
| #if !defined(COMPLEX) | |||||
| mode = BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_COMPLEX; | |||||
| #endif | |||||
| #if !defined(DOUBLE) | |||||
| mode |= BLAS_SINGLE; | |||||
| #else | |||||
| mode |= BLAS_DOUBLE; | |||||
| #endif | |||||
| blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||||
| x, inc_x, y, inc_y, NULL, 0, | |||||
| ( void *)copy_thread_function, nthreads); | |||||
| } | |||||
| #else | |||||
| do_copy(n, x, inc_x, y, inc_y); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| @@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| " cmp "J", xzr \n" | " cmp "J", xzr \n" | ||||
| " beq 3f //asum_kernel_F1 \n" | " beq 3f //asum_kernel_F1 \n" | ||||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||||
| ".align 5 \n" | ".align 5 \n" | ||||
| #endif | |||||
| "2: //asum_kernel_F32: \n" | "2: //asum_kernel_F32: \n" | ||||
| " "KERNEL_F32" \n" | " "KERNEL_F32" \n" | ||||
| " subs "J", "J", #1 \n" | " subs "J", "J", #1 \n" | ||||
| @@ -213,7 +213,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n2 = N & -2; | const BLASLONG n2 = N & -2; | ||||
| const BLASLONG n8 = N & -8; | const BLASLONG n8 = N & -8; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -219,7 +219,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const BLASLONG n2 = N & -2; | const BLASLONG n2 = N & -2; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | ||||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | ||||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | void *c, BLASLONG ldc, int (*function)(), int nthreads); | ||||
| #ifdef DYNAMIC_ARCH | |||||
| extern char* gotoblas_corename(void); | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||||
/*
 * Choose a thread count for a dot product of length N from a fixed threshold
 * table, never exceeding ncpu.
 *
 *   N     vector length.
 *   ncpu  upper bound on the returned value for every multi-thread bucket.
 *
 * Returns 1 for N up to 64500 (DOUBLE) or 110000 (otherwise), then 2, 4, 8,
 * 16, 24 (and 56 in the DOUBLE table), each capped by ncpu, and finally ncpu
 * itself above 1010000. The leading N <= 10000L test yields the same result
 * as the test that follows it.
 * The thresholds are presumably tuned for Neoverse V1, going by the function
 * name -- confirm with the author.
 */
| static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { | |||||
| #ifdef DOUBLE | |||||
| return (N <= 10000L) ? 1 | |||||
| : (N <= 64500L) ? 1 | |||||
| : (N <= 100000L) ? MIN(ncpu, 2) | |||||
| : (N <= 150000L) ? MIN(ncpu, 4) | |||||
| : (N <= 260000L) ? MIN(ncpu, 8) | |||||
| : (N <= 360000L) ? MIN(ncpu, 16) | |||||
| : (N <= 520000L) ? MIN(ncpu, 24) | |||||
| : (N <= 1010000L) ? MIN(ncpu, 56) | |||||
| : ncpu; | |||||
| #else | |||||
| return (N <= 10000L) ? 1 | |||||
| : (N <= 110000L) ? 1 | |||||
| : (N <= 200000L) ? MIN(ncpu, 2) | |||||
| : (N <= 280000L) ? MIN(ncpu, 4) | |||||
| : (N <= 520000L) ? MIN(ncpu, 8) | |||||
| : (N <= 830000L) ? MIN(ncpu, 16) | |||||
| : (N <= 1010000L) ? MIN(ncpu, 24) | |||||
| : ncpu; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
/*
 * Return the number of threads to use for a dot product of length n.
 *
 * - Built for NEOVERSEV1 (and neither COMPLEX nor BFLOAT16): always uses
 *   get_dot_optimal_nthreads_neoversev1.
 * - Built with DYNAMIC_ARCH (and neither COMPLEX nor BFLOAT16): uses that
 *   table only when gotoblas_corename() returns "neoversev1".
 * - Otherwise: 1 thread for n <= 10000, else num_cpu_avail(1).
 *
 * The default branch calls num_cpu_avail(1) a second time instead of reusing
 * ncpu.
 */
| static inline int get_dot_optimal_nthreads(BLASLONG n) { | |||||
| int ncpu = num_cpu_avail(1); | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||||
| } | |||||
| #endif | |||||
| // Default case | |||||
| if (n <= 10000L) | |||||
| return 1; | |||||
| else | |||||
| return num_cpu_avail(1); | |||||
| } | |||||
| #endif | #endif | ||||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | ||||
| @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||||
| RETURN_TYPE dot = 0.0; | RETURN_TYPE dot = 0.0; | ||||
| #if defined(SMP) | #if defined(SMP) | ||||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||||
| if (inc_x == 0 || inc_y == 0) | |||||
| nthreads = 1; | nthreads = 1; | ||||
| else | else | ||||
| nthreads = num_cpu_avail(1); | |||||
| nthreads = get_dot_optimal_nthreads(n); | |||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| dot = dot_compute(n, x, inc_x, y, inc_y); | dot = dot_compute(n, x, inc_x, y, inc_y); | ||||
| @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | ||||
| x, inc_x, y, inc_y, result, 0, | x, inc_x, y, inc_y, result, 0, | ||||
| ( void *)dot_thread_function, nthreads); | |||||
| (void *)dot_thread_function, nthreads); | |||||
| ptr = (RETURN_TYPE *)result; | ptr = (RETURN_TYPE *)result; | ||||
| for (i = 0; i < nthreads; i++) { | for (i = 0; i < nthreads; i++) { | ||||
| @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| " fadd v4.4s, v4.4s, v6.4s \n" \ | " fadd v4.4s, v4.4s, v6.4s \n" \ | ||||
| " fadd v0.4s, v0.4s, v4.4s \n" \ | " fadd v0.4s, v0.4s, v4.4s \n" \ | ||||
| " faddp v0.4s, v0.4s, v0.4s \n" \ | " faddp v0.4s, v0.4s, v0.4s \n" \ | ||||
| " faddp v0.4s, v0.4s, v0.4s \n" | |||||
| " faddp "OUT", v0.2s \n" | |||||
| #else /* !defined(DSDOT) */ | #else /* !defined(DSDOT) */ | ||||
| #define KERNEL_F1 \ | #define KERNEL_F1 \ | ||||
| @@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT | |||||
| " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | ||||
| " cmp %[J_], xzr \n" | " cmp %[J_], xzr \n" | ||||
| " beq 3f //dot_kernel_F1 \n" | " beq 3f //dot_kernel_F1 \n" | ||||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||||
| " .align 5 \n" | " .align 5 \n" | ||||
| #endif | |||||
| "2: //dot_kernel_F: \n" | "2: //dot_kernel_F: \n" | ||||
| " "KERNEL_F" \n" | " "KERNEL_F" \n" | ||||
| " subs %[J_], %[J_], #1 \n" | " subs %[J_], %[J_], #1 \n" | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| Copyright (c) 2024-2025, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| a_ptr = a; | a_ptr = a; | ||||
| if (inc_y == 1) { | if (inc_y == 1) { | ||||
| BLASLONG width = n / 3; | |||||
| uint64_t sve_size = SV_COUNT(); | uint64_t sve_size = SV_COUNT(); | ||||
| for (j = 0; j < n; j++) { | |||||
| SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); | |||||
| i = 0; | |||||
| svbool_t pg = SV_WHILE(i, m); | |||||
| while (svptest_any(SV_TRUE(), pg)) { | |||||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||||
| svbool_t pg_true = SV_TRUE(); | |||||
| svbool_t pg = SV_WHILE(0, m % sve_size); | |||||
| FLOAT *a0_ptr = a + lda * width * 0; | |||||
| FLOAT *a1_ptr = a + lda * width * 1; | |||||
| FLOAT *a2_ptr = a + lda * width * 2; | |||||
| for (j = 0; j < width; j++) { | |||||
| for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||||
| ix = j * inc_x; | |||||
| SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||||
| SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||||
| SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||||
| SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||||
| SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||||
| SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||||
| SV_TYPE y_vec = svld1(pg_true, y + i); | |||||
| y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); | |||||
| y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); | |||||
| y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); | |||||
| svst1(pg_true, y + i, y_vec); | |||||
| } | |||||
| if (i < m) { | |||||
| SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||||
| SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||||
| SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||||
| SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||||
| SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||||
| SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||||
| SV_TYPE y_vec = svld1(pg, y + i); | SV_TYPE y_vec = svld1(pg, y + i); | ||||
| y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); | |||||
| y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); | |||||
| y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); | |||||
| y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); | |||||
| ix += inc_x; | |||||
| svst1(pg, y + i, y_vec); | svst1(pg, y + i, y_vec); | ||||
| i += sve_size; | |||||
| pg = SV_WHILE(i, m); | |||||
| } | } | ||||
| a0_ptr += lda; | |||||
| a1_ptr += lda; | |||||
| a2_ptr += lda; | |||||
| } | |||||
| a_ptr = a2_ptr; | |||||
| for (j = width * 3; j < n; j++) { | |||||
| ix = j * inc_x; | |||||
| for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||||
| SV_TYPE y_vec = svld1(pg_true, y + i); | |||||
| SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||||
| SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||||
| y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); | |||||
| svst1(pg_true, y + i, y_vec); | |||||
| } | |||||
| if (i < m) { | |||||
| SV_TYPE y_vec = svld1(pg, y + i); | |||||
| SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||||
| y_vec = svmla_m(pg, y_vec, a_vec, x_vec); | |||||
| svst1(pg, y + i, y_vec); | |||||
| } | |||||
| a_ptr += lda; | a_ptr += lda; | ||||
| ix += inc_x; | ix += inc_x; | ||||
| } | } | ||||
| return(0); | |||||
| return (0); | |||||
| } | } | ||||
| for (j = 0; j < n; j++) { | for (j = 0; j < n; j++) { | ||||
| @@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| ix += inc_x; | ix += inc_x; | ||||
| } | } | ||||
| return (0); | return (0); | ||||
| } | |||||
| } | |||||
| @@ -0,0 +1,138 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2025, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <arm_sve.h> | |||||
| #include "common.h" | |||||
| #ifdef DOUBLE | |||||
| #define SV_COUNT svcntd | |||||
| #define SV_TYPE svfloat64_t | |||||
| #define SV_TRUE svptrue_b64 | |||||
| #define SV_WHILE svwhilelt_b64_s64 | |||||
| #define SV_DUP svdup_f64 | |||||
| #else | |||||
| #define SV_COUNT svcntw | |||||
| #define SV_TYPE svfloat32_t | |||||
| #define SV_TRUE svptrue_b32 | |||||
| #define SV_WHILE svwhilelt_b32_s64 | |||||
| #define SV_DUP svdup_f32 | |||||
| #endif | |||||
/*
 * GEMV, non-transposed: y += alpha * A * x.
 *
 * Column j of A starts at a + j * lda and holds m contiguous elements.
 * x is read with stride inc_x, y is updated with stride inc_y.
 * dummy1 and buffer are not used.  Always returns 0.
 *
 * When y is contiguous (inc_y == 1) the n columns are split into three
 * groups of ceil(n / 3) columns; each pass over the rows folds one column
 * from every group into y with SVE.  Any other inc_y uses the scalar loop.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
  BLASLONG row, col;
  BLASLONG x_off = 0;

  /* Strided y: plain column-by-column update. */
  if (inc_y != 1) {
    FLOAT *column = a;

    for (col = 0; col < n; col++) {
      FLOAT scale = alpha * x[x_off];
      BLASLONG y_off = 0;

      for (row = 0; row < m; row++) {
        y[y_off] += scale * column[row];
        y_off += inc_y;
      }
      column += lda;
      x_off += inc_x;
    }
    return (0);
  }

  /* Contiguous y: three columns per pass, one SVE vector of rows at a time. */
  BLASLONG vl = SV_COUNT();
  BLASLONG group = (n + 3 - 1) / 3;
  svbool_t all_lanes = SV_TRUE();

  FLOAT *col0 = a + lda * group * 0;
  FLOAT *col1 = a + lda * group * 1;
  FLOAT *col2 = a + lda * group * 2;

  FLOAT *xg0 = x + inc_x * group * 0;
  FLOAT *xg1 = x + inc_x * group * 1;
  FLOAT *xg2 = x + inc_x * group * 2;

  for (col = 0; col < group; col++) {
    /* A group may run out of columns before the others when n % 3 != 0. */
    int use0 = (col + group * 0) < n;
    int use1 = (col + group * 1) < n;
    int use2 = (col + group * 2) < n;

    svbool_t p0 = use0 ? all_lanes : svpfalse();
    svbool_t p1 = use1 ? all_lanes : svpfalse();
    svbool_t p2 = use2 ? all_lanes : svpfalse();

    /* Broadcast alpha * x for each live column; x is only read when live. */
    SV_TYPE s0 = SV_DUP(0.0);
    SV_TYPE s1 = SV_DUP(0.0);
    SV_TYPE s2 = SV_DUP(0.0);
    if (use0) s0 = SV_DUP(alpha * xg0[x_off]);
    if (use1) s1 = SV_DUP(alpha * xg1[x_off]);
    if (use2) s2 = SV_DUP(alpha * xg2[x_off]);

    /* Full vectors of rows. */
    for (row = 0; (row + vl - 1) < m; row += vl) {
      SV_TYPE acc = svld1(all_lanes, y + row);

      SV_TYPE v0 = svld1(p0, col0 + row);
      SV_TYPE v1 = svld1(p1, col1 + row);
      SV_TYPE v2 = svld1(p2, col2 + row);

      acc = svmla_m(p0, acc, s0, v0);
      acc = svmla_m(p1, acc, s1, v1);
      acc = svmla_m(p2, acc, s2, v2);

      svst1(all_lanes, y + row, acc);
    }

    /* Remaining rows: restrict every access to lanes below m. */
    if (row < m) {
      svbool_t tail = SV_WHILE(row, m);

      svbool_t t0 = svand_z(all_lanes, tail, p0);
      svbool_t t1 = svand_z(all_lanes, tail, p1);
      svbool_t t2 = svand_z(all_lanes, tail, p2);

      SV_TYPE acc = svld1(tail, y + row);

      SV_TYPE v0 = svld1(t0, col0 + row);
      SV_TYPE v1 = svld1(t1, col1 + row);
      SV_TYPE v2 = svld1(t2, col2 + row);

      acc = svmla_m(t0, acc, s0, v0);
      acc = svmla_m(t1, acc, s1, v1);
      acc = svmla_m(t2, acc, s2, v2);

      svst1(tail, y + row, acc);
    }

    col0 += lda;
    col1 += lda;
    col2 += lda;
    x_off += inc_x;
  }

  return (0);
}
| @@ -0,0 +1,207 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2025, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <arm_sve.h> | |||||
| #include "common.h" | |||||
| #ifdef DOUBLE | |||||
| #define SV_COUNT svcntd | |||||
| #define SV_TYPE svfloat64_t | |||||
| #define SV_TRUE svptrue_b64 | |||||
| #define SV_WHILE svwhilelt_b64_s64 | |||||
| #define SV_DUP svdup_f64 | |||||
| #else | |||||
| #define SV_COUNT svcntw | |||||
| #define SV_TYPE svfloat32_t | |||||
| #define SV_TRUE svptrue_b32 | |||||
| #define SV_WHILE svwhilelt_b32_s64 | |||||
| #define SV_DUP svdup_f32 | |||||
| #endif | |||||
/*
 * GEMV, non-transposed: y += alpha * A * x.
 *
 * Column j of A starts at a + j * lda and holds m contiguous elements.
 * x is read with stride inc_x, y is updated with stride inc_y.
 * dummy1 and buffer are not referenced.  Always returns 0.
 *
 * When inc_y == 1 the columns are split into three groups of ceil(n / 3)
 * columns and the rows are processed four SVE vectors at a time; any other
 * inc_y uses the scalar loop at the bottom.
 */
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| BLASLONG j; | |||||
| FLOAT *a_ptr; | |||||
| FLOAT temp; | |||||
| ix = 0; | |||||
| a_ptr = a; | |||||
/* Vector path: only taken when y is contiguous. */
| if (inc_y == 1) { | |||||
/* width = ceil(n / 3): number of columns in each of the three groups. */
| BLASLONG width = (n + 3 - 1) / 3; | |||||
/* First column of each group in A, and the matching first element of x. */
| FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||||
| FLOAT *x0_ptr = x + inc_x * width * 0; | |||||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||||
| for (j = 0; j < width; j++) { | |||||
/*
 * Predicates are named pgRC: R = row vector 0..3 within the unrolled step,
 * C = column group 0..2.  Each is all-true when column (j + width * C)
 * exists and all-false otherwise, so a group that has run out of columns
 * contributes nothing.
 */
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||||
/* alpha * x broadcast per group; x is only read when the column exists. */
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||||
| i = 0; | |||||
/* Number of FLOAT elements per SVE vector. */
| BLASLONG sve_size = SV_COUNT(); | |||||
/* Main loop: runs while four complete vectors of rows remain. */
| while ((i + sve_size * 4 - 1) < m) { | |||||
/* Load four consecutive vectors of y (vnum selects the vector). */
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||||
| SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); | |||||
| SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); | |||||
| SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); | |||||
/* Load the same four row vectors from the current column of each group. */
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||||
/* y += (alpha * x) * a, one column group at a time; the _m form keeps y
 * unchanged in lanes where the predicate is false. */
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||||
| svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); | |||||
| svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); | |||||
| svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); | |||||
| i += sve_size * 4; | |||||
| } | |||||
/* Tail: fewer than four full vectors of rows are left. */
| if (i < m) { | |||||
/* pgR is true for the lanes of row vector R whose row index is below m. */
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||||
| svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||||
| svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||||
| svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||||
/* Combine the row mask with the column-exists mask for each (R, C) pair. */
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||||
| pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||||
| pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||||
| pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||||
| pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||||
| pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||||
| pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||||
| pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||||
| pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||||
| pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||||
/* Same load / multiply-accumulate / store sequence as the main loop, but
 * every access is limited to the lanes selected above. */
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||||
| SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); | |||||
| SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); | |||||
| SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); | |||||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||||
| svst1_vnum(pg1, y + i, 1, y1_vec); | |||||
| svst1_vnum(pg2, y + i, 2, y2_vec); | |||||
| svst1_vnum(pg3, y + i, 3, y3_vec); | |||||
| } | |||||
/* Step every group to its next column and x to its next element. */
| a0_ptr += lda; | |||||
| a1_ptr += lda; | |||||
| a2_ptr += lda; | |||||
| ix += inc_x; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
/* Scalar path for strided y: one column of A at a time. */
| for (j = 0; j < n; j++) { | |||||
| temp = alpha * x[ix]; | |||||
| iy = 0; | |||||
| for (i = 0; i < m; i++) { | |||||
| y[iy] += temp * a_ptr[i]; | |||||
| iy += inc_y; | |||||
| } | |||||
| a_ptr += lda; | |||||
| ix += inc_x; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| Copyright (c) 2024, 2025 The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||||
| BLASLONG ix,iy; | BLASLONG ix,iy; | ||||
| BLASLONG j; | BLASLONG j; | ||||
| FLOAT *a_ptr; | FLOAT *a_ptr; | ||||
| FLOAT *y_ptr; | |||||
| FLOAT temp; | FLOAT temp; | ||||
| iy = 0; | iy = 0; | ||||
| if (inc_x == 1) { | if (inc_x == 1) { | ||||
| BLASLONG width = (n + 3 - 1) / 3; | |||||
| BLASLONG width = n / 3; | |||||
| BLASLONG sve_size = SV_COUNT(); | |||||
| svbool_t pg_true = SV_TRUE(); | |||||
| svbool_t pg = SV_WHILE(0, m % sve_size); | |||||
| FLOAT *a0_ptr = a + lda * width * 0; | FLOAT *a0_ptr = a + lda * width * 0; | ||||
| FLOAT *a1_ptr = a + lda * width * 1; | FLOAT *a1_ptr = a + lda * width * 1; | ||||
| @@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||||
| FLOAT *y2_ptr = y + inc_y * width * 2; | FLOAT *y2_ptr = y + inc_y * width * 2; | ||||
| for (j = 0; j < width; j++) { | for (j = 0; j < width; j++) { | ||||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||||
| SV_TYPE temp00_vec = SV_DUP(0.0); | SV_TYPE temp00_vec = SV_DUP(0.0); | ||||
| SV_TYPE temp01_vec = SV_DUP(0.0); | SV_TYPE temp01_vec = SV_DUP(0.0); | ||||
| SV_TYPE temp02_vec = SV_DUP(0.0); | SV_TYPE temp02_vec = SV_DUP(0.0); | ||||
| i = 0; | i = 0; | ||||
| BLASLONG sve_size = SV_COUNT(); | |||||
| while ((i + sve_size * 1 - 1) < m) { | while ((i + sve_size * 1 - 1) < m) { | ||||
| SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); | |||||
| SV_TYPE x0_vec = svld1(pg_true, x + i); | |||||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||||
| SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||||
| SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||||
| SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||||
| temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||||
| temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||||
| temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||||
| temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec); | |||||
| temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec); | |||||
| temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec); | |||||
| i += sve_size * 1; | i += sve_size * 1; | ||||
| } | } | ||||
| if (i < m) { | if (i < m) { | ||||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||||
| SV_TYPE x0_vec = svld1(pg, x + i); | |||||
| SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); | |||||
| SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||||
| SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||||
| SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||||
| temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||||
| temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||||
| temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||||
| temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec); | |||||
| temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec); | |||||
| temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec); | |||||
| } | } | ||||
| if ((j + width * 0) < n) { | |||||
| temp = svaddv(SV_TRUE(), temp00_vec); | |||||
| y0_ptr[iy] += alpha * temp; | |||||
| } | |||||
| if ((j + width * 1) < n) { | |||||
| temp = svaddv(SV_TRUE(), temp01_vec); | |||||
| y1_ptr[iy] += alpha * temp; | |||||
| } | |||||
| if ((j + width * 2) < n) { | |||||
| temp = svaddv(SV_TRUE(), temp02_vec); | |||||
| y2_ptr[iy] += alpha * temp; | |||||
| } | |||||
| y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec); | |||||
| y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec); | |||||
| y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec); | |||||
| iy += inc_y; | iy += inc_y; | ||||
| a0_ptr += lda; | a0_ptr += lda; | ||||
| @@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||||
| a2_ptr += lda; | a2_ptr += lda; | ||||
| } | } | ||||
| a_ptr = a2_ptr; | |||||
| y_ptr = y2_ptr; | |||||
| for (j = width * 3; j < n; j++) { | |||||
| SV_TYPE temp_vec = SV_DUP(0.0); | |||||
| i = 0; | |||||
| while ((i + sve_size * 1 - 1) < m) { | |||||
| SV_TYPE x_vec = svld1(pg_true, x + i); | |||||
| SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||||
| temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec); | |||||
| i += sve_size * 1; | |||||
| } | |||||
| if (i < m) { | |||||
| SV_TYPE x_vec = svld1(pg, x + i); | |||||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||||
| temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); | |||||
| } | |||||
| y_ptr[iy] += alpha * svaddv(pg_true, temp_vec); | |||||
| iy += inc_y; | |||||
| a_ptr += lda; | |||||
| } | |||||
| return(0); | return(0); | ||||
| } | } | ||||
| @@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| " asr "J", "N", #6 \n" | " asr "J", "N", #6 \n" | ||||
| " cmp "J", xzr \n" | " cmp "J", xzr \n" | ||||
| " beq 3f //asum_kernel_F1 \n" | " beq 3f //asum_kernel_F1 \n" | ||||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||||
| ".align 5 \n" | ".align 5 \n" | ||||
| #endif | |||||
| "2: //asum_kernel_F64: \n" | "2: //asum_kernel_F64: \n" | ||||
| " "KERNEL_F64" \n" | " "KERNEL_F64" \n" | ||||
| " subs "J", "J", #1 \n" | " subs "J", "J", #1 \n" | ||||
| @@ -0,0 +1,83 @@ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2024, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
| * POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, | |||||
| BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, | |||||
| BLASLONG ldc) { | |||||
| BLASLONG i, j; | |||||
| BLASLONG chunk, remain; | |||||
| FLOAT *c_offset1, *c_offset; | |||||
| c_offset = c; | |||||
| chunk = m >> 3; | |||||
| remain = m & 7; | |||||
| if (beta == ZERO) { | |||||
| for (j = n; j > 0; j--) { | |||||
| c_offset1 = c_offset; | |||||
| c_offset += ldc; | |||||
| for (i = chunk; i > 0; i--) { | |||||
| *(c_offset1 + 0) = ZERO; | |||||
| *(c_offset1 + 1) = ZERO; | |||||
| *(c_offset1 + 2) = ZERO; | |||||
| *(c_offset1 + 3) = ZERO; | |||||
| *(c_offset1 + 4) = ZERO; | |||||
| *(c_offset1 + 5) = ZERO; | |||||
| *(c_offset1 + 6) = ZERO; | |||||
| *(c_offset1 + 7) = ZERO; | |||||
| c_offset1 += 8; | |||||
| } | |||||
| for (i = remain; i > 0; i--) { | |||||
| *c_offset1 = ZERO; | |||||
| c_offset1++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for (j = n; j > 0; j--) { | |||||
| c_offset1 = c_offset; | |||||
| c_offset += ldc; | |||||
| for (i = chunk; i > 0; i--) { | |||||
| *(c_offset1 + 0) *= beta; | |||||
| *(c_offset1 + 1) *= beta; | |||||
| *(c_offset1 + 2) *= beta; | |||||
| *(c_offset1 + 3) *= beta; | |||||
| *(c_offset1 + 4) *= beta; | |||||
| *(c_offset1 + 5) *= beta; | |||||
| *(c_offset1 + 6) *= beta; | |||||
| *(c_offset1 + 7) *= beta; | |||||
| c_offset1 += 8; | |||||
| } | |||||
| for (i = remain; i > 0; i--) { | |||||
| *c_offset1 *= beta; | |||||
| c_offset1++; | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| }; | |||||
| @@ -0,0 +1,46 @@ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
| * POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| #include <arm_sve.h> | |||||
| #include "common.h" | |||||
| #define ALPHA_ONE | |||||
| #include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||||
| #undef ALPHA_ONE | |||||
| #include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||||
| FLOAT *C, BLASLONG ldc) { | |||||
| if (alpha == 1.0f) | |||||
| return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||||
| else | |||||
| return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc); | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,414 @@ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
| * POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| #include <arm_sve.h> | |||||
| #include "common.h" | |||||
/*
 * Helper macros for the kernel body.  They paste M/N into variable names,
 * so they rely on locals named mc<M><N>, ma<M> and mb<N> being in scope at
 * the point of use.
 */
/* INIT_C: set accumulator mc<M><N> to all zeros. */
| #define INIT_C(M, N) mc##M##N = svdup_f32(0); | |||||
/* MATMUL: mc<M><N> = svbfmmla(mc<M><N>, ma<M>, mb<N>), i.e. accumulate a
 * bfloat16 matrix product into the float32 accumulator. */
| #define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); | |||||
/* INIT_C_4x4: zero the four accumulators mc00, mc01, mc10 and mc11.
 * NOTE(review): this macro and both UPDATE_C variants end in "while (0);"
 * with a trailing semicolon, so "MACRO;" expands to two statements and would
 * not be safe as the unbraced body of an if/else - confirm all call sites
 * before changing. */
| #define INIT_C_4x4 \ | |||||
| do { \ | |||||
| INIT_C(0, 0); \ | |||||
| INIT_C(0, 1); \ | |||||
| INIT_C(1, 0); \ | |||||
| INIT_C(1, 1); \ | |||||
| } while (0); | |||||
/* UPDATE_C: read-modify-write of C through PTR under predicate PG.
 * With ALPHA_ONE:    DST = SRC + *PTR.
 * Without ALPHA_ONE: DST = svalpha * SRC + *PTR, where svalpha must be a
 * vector in scope at the call site.
 * Both use the zeroing (_z) form, then store DST back under PG. */
| #ifdef ALPHA_ONE | |||||
| #define UPDATE_C(PG, PTR, DST, SRC) \ | |||||
| do { \ | |||||
| DST = svld1_f32((PG), (PTR)); \ | |||||
| DST = svadd_z((PG), SRC, DST); \ | |||||
| svst1_f32((PG), (PTR), DST); \ | |||||
| } while (0); | |||||
| #else | |||||
| #define UPDATE_C(PG, PTR, DST, SRC) \ | |||||
| do { \ | |||||
| DST = svld1_f32((PG), (PTR)); \ | |||||
| DST = svmad_z((PG), svalpha, SRC, DST); \ | |||||
| svst1_f32((PG), (PTR), DST); \ | |||||
| } while (0); | |||||
| #endif | |||||
/* ZIP_EVEN_ELEMENTS: tmp = even-indexed elements of mc0 followed by those of
 * mc1 (svuzp1); vc = the elements of tmp selected by PG, packed into the
 * lowest lanes (svcompact). */
| #define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||||
| do { \ | |||||
| (tmp) = svuzp1_f32((mc0), (mc1)); \ | |||||
| (vc) = svcompact_f32((PG), (tmp)); \ | |||||
| } while (0) | |||||
/* ZIP_ODD_ELEMENTS: same as ZIP_EVEN_ELEMENTS but taking the odd-indexed
 * elements (svuzp2). */
| #define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||||
| do { \ | |||||
| (tmp) = svuzp2_f32((mc0), (mc1)); \ | |||||
| (vc) = svcompact_f32((PG), (tmp)); \ | |||||
| } while (0) | |||||
/* ACCUMULATE_LAST4_TO_FIRST4: TMP = mc<M><N> shifted down by four elements
 * (svext of the vector with itself), then mc<M><N> += TMP over all lanes.
 * Presumably this folds the upper four floats onto the lower four, which
 * assumes 8-float (256-bit) vectors - TODO confirm. */
| #define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP) \ | |||||
| do { \ | |||||
| TMP = svext_f32(mc##M##N, mc##M##N, 4); \ | |||||
| mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \ | |||||
| } while (0) | |||||
| #ifdef ALPHA_ONE | |||||
| int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, | |||||
| FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||||
| FLOAT *C, BLASLONG ldc) | |||||
| #else | |||||
| int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, | |||||
| FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, | |||||
| BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| BLASLONG pad_k = (k + 7) & ~7; | |||||
| svbfloat16_t ma0, ma1, mb0, mb1; | |||||
| svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3; | |||||
| svfloat32_t tmp; | |||||
| svfloat32_t svalpha = svdup_f32(alpha); | |||||
| svbool_t pg16_all = svptrue_b16(); | |||||
| svbool_t pg32_first_1 = svwhilelt_b32(0, 1); | |||||
| svbool_t pg32_first_2 = svwhilelt_b32(0, 2); | |||||
| svbool_t pg32_first_4 = svwhilelt_b32(0, 4); | |||||
| svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0); | |||||
| bfloat16_t *ptr_a = (bfloat16_t *)A; | |||||
| bfloat16_t *ptr_b = (bfloat16_t *)B; | |||||
| FLOAT *ptr_c = C; | |||||
| bfloat16_t *ptr_a0; | |||||
| bfloat16_t *ptr_b0; | |||||
| FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; | |||||
| for (BLASLONG j = 0; j < n / 4; j++) { | |||||
| ptr_c0 = ptr_c; | |||||
| ptr_c1 = ptr_c0 + ldc; | |||||
| ptr_c2 = ptr_c1 + ldc; | |||||
| ptr_c3 = ptr_c2 + ldc; | |||||
| ptr_c += 4 * ldc; | |||||
| ptr_a = (bfloat16_t *)A; | |||||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 4 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C_4x4; | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||||
| MATMUL(0, 0); | |||||
| MATMUL(0, 1); | |||||
| MATMUL(1, 0); | |||||
| MATMUL(1, 1); | |||||
| ptr_a0 += 32; | |||||
| ptr_b0 += 32; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp); | |||||
| ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||||
| ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1); | |||||
| ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2); | |||||
| ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3); | |||||
| UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||||
| UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1); | |||||
| UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2) | |||||
| UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3) | |||||
| ptr_c0 += 4; | |||||
| ptr_c1 += 4; | |||||
| ptr_c2 += 4; | |||||
| ptr_c3 += 4; | |||||
| } | |||||
| if (m & 2) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 2 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| INIT_C(0, 1); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||||
| MATMUL(0, 0); | |||||
| MATMUL(0, 1); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 32; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||||
| vc0 = svuzp1(mc00, mc00); | |||||
| vc1 = svuzp2(mc00, mc00); | |||||
| vc2 = svuzp1(mc01, mc01); | |||||
| vc3 = svuzp2(mc01, mc01); | |||||
| UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||||
| UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||||
| UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2); | |||||
| UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3); | |||||
| ptr_c0 += 2; | |||||
| ptr_c1 += 2; | |||||
| ptr_c2 += 2; | |||||
| ptr_c3 += 2; | |||||
| } | |||||
| if (m & 1) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| INIT_C(0, 1); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||||
| MATMUL(0, 0); | |||||
| MATMUL(0, 1); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 32; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||||
| // use compact is more straightforward | |||||
| vc1 = svuzp2(mc00, mc00); | |||||
| vc3 = svuzp2(mc01, mc01); | |||||
| UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||||
| UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||||
| UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01); | |||||
| UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3); | |||||
| } | |||||
| ptr_b += 4 * pad_k; | |||||
| } | |||||
| if (n & 2) { | |||||
| ptr_c0 = ptr_c; | |||||
| ptr_c1 = ptr_c0 + ldc; | |||||
| ptr_c += 2 * ldc; | |||||
| ptr_a = (bfloat16_t *)A; | |||||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 4 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| INIT_C(1, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| MATMUL(1, 0); | |||||
| ptr_a0 += 32; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||||
| ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||||
| ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2); | |||||
| UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||||
| UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2); | |||||
| ptr_c0 += 4; | |||||
| ptr_c1 += 4; | |||||
| } | |||||
| if (m & 2) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 2 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| vc0 = svuzp1(mc00, mc00); | |||||
| vc1 = svuzp2(mc00, mc00); | |||||
| UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||||
| UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||||
| ptr_c0 += 2; | |||||
| ptr_c1 += 2; | |||||
| } | |||||
| if (m & 1) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| vc1 = svuzp2(mc00, mc00); | |||||
| UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||||
| UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||||
| } | |||||
| ptr_b += 2 * pad_k; | |||||
| } | |||||
| if (n & 1) { // TODO: this case seems a overhead. find out whether it's in our | |||||
| // case. | |||||
| ptr_c0 = ptr_c; | |||||
| ptr_a = (bfloat16_t *)A; | |||||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 4 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| INIT_C(1, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| MATMUL(1, 0); | |||||
| ptr_a0 += 32; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||||
| ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||||
| UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||||
| ptr_c0 += 4; | |||||
| } | |||||
| if (m & 2) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_a += 2 * pad_k; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| vc0 = svuzp1(mc00, mc00); | |||||
| UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||||
| ptr_c0 += 2; | |||||
| } | |||||
| if (m & 1) { | |||||
| ptr_a0 = ptr_a; | |||||
| ptr_b0 = ptr_b; | |||||
| INIT_C(0, 0); | |||||
| for (BLASLONG p = 0; p < pad_k; p += 8) { | |||||
| ma0 = svld1_bf16(pg16_all, ptr_a0); | |||||
| mb0 = svld1_bf16(pg16_all, ptr_b0); | |||||
| MATMUL(0, 0); | |||||
| ptr_a0 += 16; | |||||
| ptr_b0 += 16; | |||||
| } | |||||
| ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||||
| UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,148 @@ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
| * POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| #include <arm_sve.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||||
| IFLOAT *a_offset; | |||||
| IFLOAT *a_offsetx[4]; | |||||
| IFLOAT *b_offset; | |||||
| a_offset = a; | |||||
| b_offset = b; | |||||
| bfloat16_t zero_value_bf16; | |||||
| *((uint16_t *)(&zero_value_bf16)) = 0; | |||||
| svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine. | |||||
| svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||||
| svbfloat16_t v0, v1, v2, v3; | |||||
| svuint64_t t0, t1; | |||||
| BLASLONG rest = m & 7; | |||||
| svbool_t pg16_rest = svwhilelt_b16_s32(0, rest); | |||||
| for (BLASLONG j = 0; j < n / 4; j++) { | |||||
| a_offsetx[0] = a_offset; | |||||
| a_offsetx[1] = a_offsetx[0] + lda; | |||||
| a_offsetx[2] = a_offsetx[1] + lda; | |||||
| a_offsetx[3] = a_offsetx[2] + lda; | |||||
| a_offset += 4 * lda; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||||
| v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]); | |||||
| v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||||
| svreinterpret_bf16_u64(t1)); | |||||
| a_offsetx[0] += 8; | |||||
| a_offsetx[1] += 8; | |||||
| a_offsetx[2] += 8; | |||||
| a_offsetx[3] += 8; | |||||
| b_offset += 32; | |||||
| } | |||||
| if (rest) { // remainder along k dim | |||||
| v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||||
| v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]); | |||||
| v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||||
| svreinterpret_bf16_u64(t1)); | |||||
| b_offset += 32; | |||||
| } | |||||
| } | |||||
| if (n & 2) { | |||||
| a_offsetx[0] = a_offset; | |||||
| a_offsetx[1] = a_offsetx[0] + lda; | |||||
| a_offset += 2 * lda; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| b_offset += 16; | |||||
| a_offsetx[0] += 8; | |||||
| a_offsetx[1] += 8; | |||||
| } | |||||
| if (rest) { // remainder along k dim | |||||
| v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| b_offset += 16; | |||||
| } | |||||
| } | |||||
| if (n & 1) { | |||||
| a_offsetx[0] = a_offset; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svdup_bf16(zero_value_bf16); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| b_offset += 16; | |||||
| a_offsetx[0] += 8; | |||||
| } | |||||
| if (rest) { // remainder along k dim | |||||
| v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||||
| v1 = svdup_bf16(zero_value_bf16); | |||||
| t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||||
| svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,361 @@ | |||||
| /*************************************************************************** | |||||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||||
| * All rights reserved. | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written permission. | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||||
| * POSSIBILITY OF SUCH DAMAGE. | |||||
| * *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <arm_neon.h> | |||||
| #include <arm_sve.h> | |||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||||
| BLASLONG pad_m = ((m + 7) & ~7); | |||||
| BLASLONG rest = (m & 7); // rest along m dim | |||||
| IFLOAT *a_offset; | |||||
| IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; | |||||
| IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; | |||||
| IFLOAT *b_offset; | |||||
| IFLOAT *b_offset0, *b_offset1; | |||||
| a_offset = a; | |||||
| b_offset = b; | |||||
| svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; | |||||
| svuint16_t t0, t1, t2, t3; | |||||
| svuint32_t m00, m01, m10, m11; | |||||
| svuint64_t st_offsets_0, st_offsets_1; | |||||
| svbool_t pg16_first_4 = svwhilelt_b16(0, 4); | |||||
| svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||||
| svbool_t pg64_first_4 = svwhilelt_b64(0, 4); | |||||
| u_int32_t sizeof_u64 = 8; | |||||
| u_int64_t _st_offsets_0[4] = { | |||||
| 0 * sizeof_u64, | |||||
| 1 * sizeof_u64, | |||||
| 4 * sizeof_u64, | |||||
| 5 * sizeof_u64, | |||||
| }; | |||||
| u_int64_t _st_offsets_1[4] = { | |||||
| 2 * sizeof_u64, | |||||
| 3 * sizeof_u64, | |||||
| 6 * sizeof_u64, | |||||
| 7 * sizeof_u64, | |||||
| }; | |||||
| st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); | |||||
| st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); | |||||
| for (BLASLONG j = 0; j < n / 8; j++) { | |||||
| a_offset0 = a_offset; | |||||
| a_offset1 = a_offset0 + lda; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset5 = a_offset4 + lda; | |||||
| a_offset6 = a_offset5 + lda; | |||||
| a_offset7 = a_offset6 + lda; | |||||
| a_offset += 8; | |||||
| b_offset0 = b_offset; | |||||
| b_offset1 = b_offset0 + 4 * pad_m; | |||||
| b_offset += 8 * pad_m; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||||
| // small blocks | |||||
| c0 = svld1_u16(pg16_first_8, a_offset0); | |||||
| c1 = svld1_u16(pg16_first_8, a_offset1); | |||||
| c2 = svld1_u16(pg16_first_8, a_offset2); | |||||
| c3 = svld1_u16(pg16_first_8, a_offset3); | |||||
| c4 = svld1_u16(pg16_first_8, a_offset4); | |||||
| c5 = svld1_u16(pg16_first_8, a_offset5); | |||||
| c6 = svld1_u16(pg16_first_8, a_offset6); | |||||
| c7 = svld1_u16(pg16_first_8, a_offset7); | |||||
| t0 = svzip1_u16(c0, c1); | |||||
| t1 = svzip1_u16(c2, c3); | |||||
| t2 = svzip1_u16(c4, c5); | |||||
| t3 = svzip1_u16(c6, c7); | |||||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||||
| st_offsets_0, svreinterpret_u64_u32(m10)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||||
| st_offsets_1, svreinterpret_u64_u32(m11)); | |||||
| a_offset0 += 8 * lda; | |||||
| a_offset1 += 8 * lda; | |||||
| a_offset2 += 8 * lda; | |||||
| a_offset3 += 8 * lda; | |||||
| a_offset4 += 8 * lda; | |||||
| a_offset5 += 8 * lda; | |||||
| a_offset6 += 8 * lda; | |||||
| a_offset7 += 8 * lda; | |||||
| b_offset0 += 32; | |||||
| b_offset1 += 32; | |||||
| } | |||||
| if (rest) { | |||||
| c0 = svld1_u16(pg16_first_8, a_offset0); | |||||
| c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); | |||||
| c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); | |||||
| c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); | |||||
| c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); | |||||
| c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); | |||||
| c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); | |||||
| c7 = (svdup_u16(0)); | |||||
| t0 = svzip1_u16(c0, c1); | |||||
| t1 = svzip1_u16(c2, c3); | |||||
| t2 = svzip1_u16(c4, c5); | |||||
| t3 = svzip1_u16(c6, c7); | |||||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||||
| st_offsets_0, svreinterpret_u64_u32(m10)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||||
| st_offsets_1, svreinterpret_u64_u32(m11)); | |||||
| } | |||||
| } | |||||
| if (n & 4) { | |||||
| a_offset0 = a_offset; | |||||
| a_offset1 = a_offset0 + lda; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset5 = a_offset4 + lda; | |||||
| a_offset6 = a_offset5 + lda; | |||||
| a_offset7 = a_offset6 + lda; | |||||
| a_offset += 4; | |||||
| b_offset0 = b_offset; | |||||
| b_offset += 4 * pad_m; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||||
| // small blocks | |||||
| c0 = svld1_u16(pg16_first_4, a_offset0); | |||||
| c1 = svld1_u16(pg16_first_4, a_offset1); | |||||
| c2 = svld1_u16(pg16_first_4, a_offset2); | |||||
| c3 = svld1_u16(pg16_first_4, a_offset3); | |||||
| c4 = svld1_u16(pg16_first_4, a_offset4); | |||||
| c5 = svld1_u16(pg16_first_4, a_offset5); | |||||
| c6 = svld1_u16(pg16_first_4, a_offset6); | |||||
| c7 = svld1_u16(pg16_first_4, a_offset7); | |||||
| t0 = svzip1_u16(c0, c1); | |||||
| t1 = svzip1_u16(c2, c3); | |||||
| t2 = svzip1_u16(c4, c5); | |||||
| t3 = svzip1_u16(c6, c7); | |||||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||||
| a_offset0 += 8 * lda; | |||||
| a_offset1 += 8 * lda; | |||||
| a_offset2 += 8 * lda; | |||||
| a_offset3 += 8 * lda; | |||||
| a_offset4 += 8 * lda; | |||||
| a_offset5 += 8 * lda; | |||||
| a_offset6 += 8 * lda; | |||||
| a_offset7 += 8 * lda; | |||||
| b_offset0 += 32; | |||||
| } | |||||
| if (rest) { | |||||
| c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 | |||||
| c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); | |||||
| c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); | |||||
| c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); | |||||
| c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); | |||||
| c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); | |||||
| c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); | |||||
| c7 = (svdup_u16(0)); | |||||
| t0 = svzip1_u16(c0, c1); | |||||
| t1 = svzip1_u16(c2, c3); | |||||
| t2 = svzip1_u16(c4, c5); | |||||
| t3 = svzip1_u16(c6, c7); | |||||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||||
| } | |||||
| } | |||||
| if (n & 2) { | |||||
| a_offset0 = a_offset; | |||||
| a_offset1 = a_offset0 + lda; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset5 = a_offset4 + lda; | |||||
| a_offset6 = a_offset5 + lda; | |||||
| a_offset7 = a_offset6 + lda; | |||||
| a_offset += 2; | |||||
| b_offset0 = b_offset; | |||||
| b_offset1 = b_offset0 + 8; | |||||
| b_offset += 2 * pad_m; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| for (BLASLONG line = 0; line < 2; line++) { | |||||
| b_offset0[line * 4] = a_offset0[line]; | |||||
| b_offset0[line * 4 + 1] = a_offset1[line]; | |||||
| b_offset0[line * 4 + 2] = a_offset2[line]; | |||||
| b_offset0[line * 4 + 3] = a_offset3[line]; | |||||
| b_offset1[line * 4] = a_offset4[line]; | |||||
| b_offset1[line * 4 + 1] = a_offset5[line]; | |||||
| b_offset1[line * 4 + 2] = a_offset6[line]; | |||||
| b_offset1[line * 4 + 3] = a_offset7[line]; | |||||
| } | |||||
| b_offset0 += 16; | |||||
| b_offset1 += 16; | |||||
| a_offset0 += 8 * lda; | |||||
| a_offset1 += 8 * lda; | |||||
| a_offset2 += 8 * lda; | |||||
| a_offset3 += 8 * lda; | |||||
| a_offset4 += 8 * lda; | |||||
| a_offset5 += 8 * lda; | |||||
| a_offset6 += 8 * lda; | |||||
| a_offset7 += 8 * lda; | |||||
| } | |||||
| if (rest) { | |||||
| for (BLASLONG line = 0; line < 2; line++) { | |||||
| b_offset0[line * 4] = a_offset0[line]; | |||||
| b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; | |||||
| b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; | |||||
| b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; | |||||
| b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; | |||||
| b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; | |||||
| b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line]; | |||||
| b_offset1[line * 4 + 3] = 0; | |||||
| } | |||||
| } | |||||
| } | |||||
| if (n & 1) { | |||||
| a_offset0 = a_offset; | |||||
| a_offset1 = a_offset0 + lda; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset5 = a_offset4 + lda; | |||||
| a_offset6 = a_offset5 + lda; | |||||
| a_offset7 = a_offset6 + lda; | |||||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||||
| b_offset[0] = a_offset0[0]; | |||||
| b_offset[1] = a_offset1[0]; | |||||
| b_offset[2] = a_offset2[0]; | |||||
| b_offset[3] = a_offset3[0]; | |||||
| b_offset[4] = 0; | |||||
| b_offset[5] = 0; | |||||
| b_offset[6] = 0; | |||||
| b_offset[7] = 0; | |||||
| b_offset[8] = a_offset4[0]; | |||||
| b_offset[9] = a_offset5[0]; | |||||
| b_offset[10] = a_offset6[0]; | |||||
| b_offset[11] = a_offset7[0]; | |||||
| b_offset[12] = 0; | |||||
| b_offset[13] = 0; | |||||
| b_offset[14] = 0; | |||||
| b_offset[15] = 0; | |||||
| b_offset += 16; | |||||
| a_offset0 += 8 * lda; | |||||
| a_offset1 += 8 * lda; | |||||
| a_offset2 += 8 * lda; | |||||
| a_offset3 += 8 * lda; | |||||
| a_offset4 += 8 * lda; | |||||
| a_offset5 += 8 * lda; | |||||
| a_offset6 += 8 * lda; | |||||
| a_offset7 += 8 * lda; | |||||
| } | |||||
| if (rest) { | |||||
| b_offset[0] = *a_offset0; | |||||
| b_offset[1] = rest == 1 ? 0 : *a_offset1; | |||||
| b_offset[2] = rest <= 2 ? 0 : *a_offset2; | |||||
| b_offset[3] = rest <= 3 ? 0 : *a_offset3; | |||||
| b_offset[4] = 0; | |||||
| b_offset[5] = 0; | |||||
| b_offset[6] = 0; | |||||
| b_offset[7] = 0; | |||||
| b_offset[8] = rest <= 4 ? 0 : *a_offset4; | |||||
| b_offset[9] = rest <= 5 ? 0 : *a_offset5; | |||||
| b_offset[10] = rest <= 6 ? 0 : *a_offset6; | |||||
| b_offset[11] = 0; | |||||
| b_offset[12] = 0; | |||||
| b_offset[13] = 0; | |||||
| b_offset[14] = 0; | |||||
| b_offset[15] = 0; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,515 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2025, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <arm_neon.h> | |||||
| static void beta_op(float *x, BLASLONG n, FLOAT beta) { | |||||
| if (beta == 0) { | |||||
| memset(x, 0, n * sizeof(float)); | |||||
| return; | |||||
| } | |||||
| float32x4_t y0, y1, y2, y3; | |||||
| for (BLASLONG i = 0; i < n / 16; i++) { | |||||
| y0 = vld1q_f32(x); | |||||
| y1 = vld1q_f32(x + 4); | |||||
| y2 = vld1q_f32(x + 8); | |||||
| y3 = vld1q_f32(x + 12); | |||||
| y0 = vmulq_n_f32(y0, beta); | |||||
| y1 = vmulq_n_f32(y1, beta); | |||||
| y2 = vmulq_n_f32(y2, beta); | |||||
| y3 = vmulq_n_f32(y3, beta); | |||||
| vst1q_f32(x, y0); | |||||
| vst1q_f32(x + 4, y1); | |||||
| vst1q_f32(x + 8, y2); | |||||
| vst1q_f32(x + 12, y3); | |||||
| x += 16; | |||||
| } | |||||
| if (n & 15) { | |||||
| BLASLONG rest_n = n & 15; | |||||
| for (BLASLONG i = 0; i < (rest_n) / 4; i++) { | |||||
| y0 = vld1q_f32(x); | |||||
| y0 = vmulq_n_f32(y0, beta); | |||||
| vst1q_f32(x, y0); | |||||
| x += 4; | |||||
| } | |||||
| for (BLASLONG i = 0; i < (rest_n & 3); i ++) { | |||||
| x[i] *= beta; | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, | |||||
| bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { | |||||
| BLASLONG i, j; | |||||
| bfloat16_t *a_ptr, *x_ptr; | |||||
| FLOAT *y_ptr; | |||||
| bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; | |||||
| bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; | |||||
| bfloat16x8_t x_vec; | |||||
| bfloat16x4_t x_vecx4; | |||||
| float32x4_t y1_vec, y2_vec; | |||||
| float32x4_t fp32_low, fp32_high; | |||||
| float x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, | |||||
| *a_ptr7; | |||||
| a_ptr = (bfloat16_t *)a; | |||||
| x_ptr = (bfloat16_t *)x; | |||||
| BLASLONG rest_m = m & 3; | |||||
| bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0)); | |||||
| bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0)); | |||||
| if (incx == 1 && incy == 1) { | |||||
| if (beta != 1) { | |||||
| beta_op(y, m, beta); | |||||
| } | |||||
| for (i = 0; i < n / 8; i++) { | |||||
| a_ptr0 = a_ptr; | |||||
| a_ptr1 = a_ptr0 + lda; | |||||
| a_ptr2 = a_ptr1 + lda; | |||||
| a_ptr3 = a_ptr2 + lda; | |||||
| a_ptr4 = a_ptr3 + lda; | |||||
| a_ptr5 = a_ptr4 + lda; | |||||
| a_ptr6 = a_ptr5 + lda; | |||||
| a_ptr7 = a_ptr6 + lda; | |||||
| a_ptr += 8 * lda; | |||||
| y_ptr = y; | |||||
| x_vec = vld1q_bf16(x_ptr); | |||||
| if (alpha != 1) { | |||||
| fp32_low = vreinterpretq_f32_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||||
| vreinterpretq_u16_bf16(x_vec))); | |||||
| fp32_high = vreinterpretq_f32_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||||
| vreinterpretq_u16_bf16(x_vec))); | |||||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||||
| fp32_high = vmulq_n_f32(fp32_high, alpha); | |||||
| x_vec = | |||||
| vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high)); | |||||
| } | |||||
| for (j = 0; j < m / 8; j++) { | |||||
| a0 = vld1q_bf16(a_ptr0); | |||||
| a1 = vld1q_bf16(a_ptr1); | |||||
| a2 = vld1q_bf16(a_ptr2); | |||||
| a3 = vld1q_bf16(a_ptr3); | |||||
| a4 = vld1q_bf16(a_ptr4); | |||||
| a5 = vld1q_bf16(a_ptr5); | |||||
| a6 = vld1q_bf16(a_ptr6); | |||||
| a7 = vld1q_bf16(a_ptr7); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| y2_vec = vld1q_f32(y_ptr + 4); | |||||
| t0 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t1 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||||
| t2 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||||
| t3 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||||
| t4 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t5 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||||
| t6 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||||
| t7 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0); | |||||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1); | |||||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2); | |||||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3); | |||||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4); | |||||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5); | |||||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6); | |||||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| vst1q_f32(y_ptr + 4, y2_vec); | |||||
| a_ptr0 += 8; | |||||
| a_ptr1 += 8; | |||||
| a_ptr2 += 8; | |||||
| a_ptr3 += 8; | |||||
| a_ptr4 += 8; | |||||
| a_ptr5 += 8; | |||||
| a_ptr6 += 8; | |||||
| a_ptr7 += 8; | |||||
| y_ptr += 8; | |||||
| } | |||||
| if (m & 4) { | |||||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||||
| bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||||
| bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||||
| bfloat16x4_t a4x4 = vld1_bf16(a_ptr4); | |||||
| bfloat16x4_t a5x4 = vld1_bf16(a_ptr5); | |||||
| bfloat16x4_t a6x4 = vld1_bf16(a_ptr6); | |||||
| bfloat16x4_t a7x4 = vld1_bf16(a_ptr7); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| a0 = vcombine_bf16(a0x4, bf16_zero); | |||||
| a1 = vcombine_bf16(a1x4, bf16_zero); | |||||
| a2 = vcombine_bf16(a2x4, bf16_zero); | |||||
| a3 = vcombine_bf16(a3x4, bf16_zero); | |||||
| a4 = vcombine_bf16(a4x4, bf16_zero); | |||||
| a5 = vcombine_bf16(a5x4, bf16_zero); | |||||
| a6 = vcombine_bf16(a6x4, bf16_zero); | |||||
| a7 = vcombine_bf16(a7x4, bf16_zero); | |||||
| t0 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t1 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||||
| t2 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||||
| t3 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| a_ptr0 += 4; | |||||
| a_ptr1 += 4; | |||||
| a_ptr2 += 4; | |||||
| a_ptr3 += 4; | |||||
| a_ptr4 += 4; | |||||
| a_ptr5 += 4; | |||||
| a_ptr6 += 4; | |||||
| a_ptr7 += 4; | |||||
| y_ptr += 4; | |||||
| } | |||||
| if (rest_m) { | |||||
| x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); | |||||
| x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); | |||||
| x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); | |||||
| x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); | |||||
| x4 = alpha * vcvtah_f32_bf16(x_ptr[4]); | |||||
| x5 = alpha * vcvtah_f32_bf16(x_ptr[5]); | |||||
| x6 = alpha * vcvtah_f32_bf16(x_ptr[6]); | |||||
| x7 = alpha * vcvtah_f32_bf16(x_ptr[7]); | |||||
| for (BLASLONG j = 0; j < rest_m; j++) { | |||||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||||
| y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||||
| y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||||
| y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||||
| y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]); | |||||
| y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]); | |||||
| y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]); | |||||
| y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]); | |||||
| } | |||||
| } | |||||
| x_ptr += 8; | |||||
| } | |||||
| if (n & 4) { | |||||
| a_ptr0 = a_ptr; | |||||
| a_ptr1 = a_ptr0 + lda; | |||||
| a_ptr2 = a_ptr1 + lda; | |||||
| a_ptr3 = a_ptr2 + lda; | |||||
| a_ptr += 4 * lda; | |||||
| x_vecx4 = vld1_bf16(x_ptr); | |||||
| if (alpha != 1) { | |||||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||||
| x_vecx4 = vcvt_bf16_f32(fp32_low); | |||||
| } | |||||
| y_ptr = y; | |||||
| for (j = 0; j < m / 8; j++) { | |||||
| a0 = vld1q_bf16(a_ptr0); | |||||
| a1 = vld1q_bf16(a_ptr1); | |||||
| a2 = vld1q_bf16(a_ptr2); | |||||
| a3 = vld1q_bf16(a_ptr3); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| y2_vec = vld1q_f32(y_ptr + 4); | |||||
| t0 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t1 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||||
| t4 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t5 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); | |||||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); | |||||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2); | |||||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| vst1q_f32(y_ptr + 4, y2_vec); | |||||
| a_ptr0 += 8; | |||||
| a_ptr1 += 8; | |||||
| a_ptr2 += 8; | |||||
| a_ptr3 += 8; | |||||
| y_ptr += 8; | |||||
| } | |||||
| if (m & 4) { | |||||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||||
| bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||||
| bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| a0 = vcombine_bf16(a0x4, a2x4); | |||||
| a1 = vcombine_bf16(a1x4, a3x4); | |||||
| t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t1 = vreinterpretq_bf16_u16(vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| a_ptr0 += 4; | |||||
| a_ptr1 += 4; | |||||
| a_ptr2 += 4; | |||||
| a_ptr3 += 4; | |||||
| y_ptr += 4; | |||||
| } | |||||
| if (rest_m) { | |||||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||||
| x2 = vgetq_lane_f32(fp32_low, 2); | |||||
| x3 = vgetq_lane_f32(fp32_low, 3); | |||||
| for (BLASLONG j = 0; j < rest_m; j++) { | |||||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||||
| y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||||
| y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||||
| y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||||
| } | |||||
| } | |||||
| x_ptr += 4; | |||||
| } | |||||
| if (n & 2) { | |||||
| a_ptr0 = a_ptr; | |||||
| a_ptr1 = a_ptr0 + lda; | |||||
| a_ptr += 2 * lda; | |||||
| x_vecx4 = vreinterpret_bf16_u16(vzip1_u16( | |||||
| vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])), | |||||
| vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1])) | |||||
| )); | |||||
| if (alpha != 1) { | |||||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||||
| x_vecx4 = vcvt_bf16_f32(fp32_low); | |||||
| } | |||||
| y_ptr = y; | |||||
| for (j = 0; j < m / 8; j++) { | |||||
| a0 = vld1q_bf16(a_ptr0); | |||||
| a1 = vld1q_bf16(a_ptr1); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| y2_vec = vld1q_f32(y_ptr + 4); | |||||
| t0 = vreinterpretq_bf16_u16( | |||||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| t1 = vreinterpretq_bf16_u16( | |||||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0); | |||||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| vst1q_f32(y_ptr + 4, y2_vec); | |||||
| a_ptr0 += 8; | |||||
| a_ptr1 += 8; | |||||
| y_ptr += 8; | |||||
| } | |||||
| if (m & 4) { | |||||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||||
| y1_vec = vld1q_f32(y_ptr); | |||||
| a0 = vcombine_bf16(a0x4, bf16_zero); | |||||
| a1 = vcombine_bf16(a1x4, bf16_zero); | |||||
| t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||||
| vst1q_f32(y_ptr, y1_vec); | |||||
| a_ptr0 += 4; | |||||
| a_ptr1 += 4; | |||||
| y_ptr += 4; | |||||
| } | |||||
| if (m & 2) { | |||||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||||
| y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||||
| y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||||
| y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]); | |||||
| y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]); | |||||
| a_ptr0 += 2; | |||||
| a_ptr1 += 2; | |||||
| y_ptr += 2; | |||||
| } | |||||
| if (m & 1) { | |||||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||||
| y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||||
| y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||||
| } | |||||
| x_ptr += 2; | |||||
| } | |||||
| if (n & 1) { | |||||
| x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha; | |||||
| y_ptr = y; | |||||
| a_ptr0 = a_ptr; | |||||
| for (j = 0; j < m; j++) { | |||||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| BLASLONG iy = 0; | |||||
| for (i = 0; i < m; i++) { | |||||
| y[iy] *= beta; | |||||
| iy += incy; | |||||
| } | |||||
| for (j = 0; j < n; j++) { | |||||
| x0 = alpha * vcvtah_f32_bf16(*x_ptr); | |||||
| iy = 0; | |||||
| for (i = 0; i < m; i++) { | |||||
| y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]); | |||||
| iy += incy; | |||||
| } | |||||
| a_ptr += lda; | |||||
| x_ptr += incx; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,202 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2025, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <arm_neon.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) | |||||
| { | |||||
| if (m < 1 || n < 1) return(0); | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| BLASLONG j; | |||||
| bfloat16_t *a_ptr; | |||||
| bfloat16_t *x_ptr; | |||||
| float *y_ptr; | |||||
| float temp; | |||||
| iy = 0; | |||||
| a_ptr = (bfloat16_t*)(a); | |||||
| x_ptr = (bfloat16_t*)(x); | |||||
| if (incx == 1) { | |||||
| BLASLONG width = n / 4; | |||||
| bfloat16_t *a0_ptr = a_ptr + lda * width * 0; | |||||
| bfloat16_t *a1_ptr = a_ptr + lda * width * 1; | |||||
| bfloat16_t *a2_ptr = a_ptr + lda * width * 2; | |||||
| bfloat16_t *a3_ptr = a_ptr + lda * width * 3; | |||||
| float *y0_ptr = y + incy * width * 0; | |||||
| float *y1_ptr = y + incy * width * 1; | |||||
| float *y2_ptr = y + incy * width * 2; | |||||
| float *y3_ptr = y + incy * width * 3; | |||||
| for (j = 0; j < width; j++) { | |||||
| float32x4_t temp0_vec = vdupq_n_f32(0.0f); | |||||
| float32x4_t temp1_vec = vdupq_n_f32(0.0f); | |||||
| float32x4_t temp2_vec = vdupq_n_f32(0.0f); | |||||
| float32x4_t temp3_vec = vdupq_n_f32(0.0f); | |||||
| i = 0; | |||||
| while (i + 7 < m) { | |||||
| bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); | |||||
| bfloat16x8_t a0_vec = vld1q_bf16(a0_ptr + i); | |||||
| bfloat16x8_t a1_vec = vld1q_bf16(a1_ptr + i); | |||||
| bfloat16x8_t a2_vec = vld1q_bf16(a2_ptr + i); | |||||
| bfloat16x8_t a3_vec = vld1q_bf16(a3_ptr + i); | |||||
| temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); | |||||
| temp1_vec = vbfdotq_f32(temp1_vec, a1_vec, x_vec); | |||||
| temp2_vec = vbfdotq_f32(temp2_vec, a2_vec, x_vec); | |||||
| temp3_vec = vbfdotq_f32(temp3_vec, a3_vec, x_vec); | |||||
| i += 8; | |||||
| } | |||||
| if (i + 3 < m) { | |||||
| float32x2_t t0 = vdup_n_f32(0.0f); | |||||
| float32x2_t t1 = vdup_n_f32(0.0f); | |||||
| float32x2_t t2 = vdup_n_f32(0.0f); | |||||
| float32x2_t t3 = vdup_n_f32(0.0f); | |||||
| bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); | |||||
| bfloat16x4_t a0_vec = vld1_bf16(a0_ptr + i); | |||||
| bfloat16x4_t a1_vec = vld1_bf16(a1_ptr + i); | |||||
| bfloat16x4_t a2_vec = vld1_bf16(a2_ptr + i); | |||||
| bfloat16x4_t a3_vec = vld1_bf16(a3_ptr + i); | |||||
| t0 = vbfdot_f32(t0, a0_vec, x_vec); | |||||
| t1 = vbfdot_f32(t1, a1_vec, x_vec); | |||||
| t2 = vbfdot_f32(t2, a2_vec, x_vec); | |||||
| t3 = vbfdot_f32(t3, a3_vec, x_vec); | |||||
| float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); | |||||
| float32x2_t temp1_vec_low = vget_low_f32(temp1_vec); | |||||
| float32x2_t temp2_vec_low = vget_low_f32(temp2_vec); | |||||
| float32x2_t temp3_vec_low = vget_low_f32(temp3_vec); | |||||
| temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); | |||||
| temp1_vec = vcombine_f32(vadd_f32(t1, temp1_vec_low), vget_high_f32(temp1_vec)); | |||||
| temp2_vec = vcombine_f32(vadd_f32(t2, temp2_vec_low), vget_high_f32(temp2_vec)); | |||||
| temp3_vec = vcombine_f32(vadd_f32(t3, temp3_vec_low), vget_high_f32(temp3_vec)); | |||||
| i += 4; | |||||
| } | |||||
| if (beta == 0.0f) { | |||||
| y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec); | |||||
| y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec); | |||||
| y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec); | |||||
| y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec); | |||||
| } | |||||
| else { | |||||
| y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y0_ptr[iy]; | |||||
| y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec) + beta * y1_ptr[iy]; | |||||
| y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec) + beta * y2_ptr[iy]; | |||||
| y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec) + beta * y3_ptr[iy]; | |||||
| } | |||||
| for (; i < m; ++i) { | |||||
| y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||||
| y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||||
| y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||||
| y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||||
| } | |||||
| iy += incy; | |||||
| a0_ptr += lda; | |||||
| a1_ptr += lda; | |||||
| a2_ptr += lda; | |||||
| a3_ptr += lda; | |||||
| } | |||||
| a_ptr = a3_ptr; | |||||
| y_ptr = y3_ptr; | |||||
| for (j = width * 4; j < n; j++) { | |||||
| float32x4_t temp0_vec = vdupq_n_f32(0.0f); | |||||
| i = 0; | |||||
| while (i + 7 < m) { | |||||
| bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); | |||||
| bfloat16x8_t a0_vec = vld1q_bf16(a_ptr + i); | |||||
| temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); | |||||
| i += 8; | |||||
| } | |||||
| if (i + 3 < m) { | |||||
| float32x2_t t0 = vdup_n_f32(0.0f); | |||||
| bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); | |||||
| bfloat16x4_t a0_vec = vld1_bf16(a_ptr + i); | |||||
| t0 = vbfdot_f32(t0, a0_vec, x_vec); | |||||
| float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); | |||||
| temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); | |||||
| i += 4; | |||||
| } | |||||
| if (beta == 0.0f) { | |||||
| y_ptr[iy] = alpha * vaddvq_f32(temp0_vec); | |||||
| } | |||||
| else { | |||||
| y_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y_ptr[iy]; | |||||
| } | |||||
| for (; i < m; ++i) { | |||||
| y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||||
| } | |||||
| iy += incy; | |||||
| a_ptr += lda; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| for (j = 0; j < n; j++) { | |||||
| temp = 0.0; | |||||
| ix = 0; | |||||
| for (i = 0; i < m; i++) { | |||||
| temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); | |||||
| ix += incx; | |||||
| } | |||||
| if (beta == 0.0f) { | |||||
| y[iy] = alpha * temp; | |||||
| } | |||||
| else { | |||||
| y[iy] = alpha * temp + beta * y[iy]; | |||||
| } | |||||
| iy += incy; | |||||
| a_ptr += lda; | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,80 @@ | |||||
| /* | |||||
| Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. | |||||
| SPDX-License-Identifier: BSD-3-Clause-Clear | |||||
| */ | |||||
| #include "common.h" | |||||
| #include <stdlib.h> | |||||
| #include <inttypes.h> | |||||
| #include <math.h> | |||||
| #if defined(HAVE_SME) | |||||
| /* Function prototypes */ | |||||
/*
 * Externally defined helper routines.  The __asm__ labels pin the linker
 * symbol names exactly as written, independent of any platform symbol
 * prefixing.  NOTE(review): presumably implemented in assembly elsewhere in
 * the project -- confirm.
 */

/* Takes an nbr x nbc input `a` and fills the scratch buffer `a_mod`. */
extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,
                                         const float * restrict a,
                                         float * a_mod)
    __asm__("sgemm_direct_sme1_preprocess");

/* Product kernel over (m, k, n) with left/right operands and a result. */
extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,
                                      const float * matLeft,
                                      const float * restrict matRight,
                                      const float * restrict matResult)
    __asm__("sgemm_direct_sme1_2VLx2VL");
| /* Function Definitions */ | |||||
/*
 * Return the number of 32-bit elements in one streaming vector: RDSVL with
 * multiplier 1 yields the streaming vector length in bytes, and the shift
 * right by two divides that by sizeof(float).
 *
 * Uses __asm__ rather than the bare asm keyword so the file also builds in
 * strict ISO modes, and a (void) parameter list so this is a real prototype.
 */
uint64_t sve_cntw(void) {
    uint64_t cnt;
    __asm__ __volatile__(
        "rdsvl  %[res], #1\n"
        "lsr    %[res], %[res], #2\n"
        : [res] "=r" (cnt)
    );
    return cnt;
}
| /*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ | |||||
| float * __restrict A, BLASLONG strideA, float * __restrict B,\ | |||||
| BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||||
| */ | |||||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||||
| float * __restrict R, BLASLONG strideR){ | |||||
| uint64_t m_mod, vl_elms; | |||||
| vl_elms = sve_cntw(); | |||||
| m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | |||||
| float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | |||||
| /* Prevent compiler optimization by reading from memory instead | |||||
| * of reading directly from vector (z) registers. | |||||
| * */ | |||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| /* Pre-process the left matrix to make it suitable for | |||||
| matrix sum of outer-product calculation | |||||
| */ | |||||
| sgemm_direct_sme1_preprocess(M, K, A, A_mod); | |||||
| /* Calculate C = A*B */ | |||||
| sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | |||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| free(A_mod); | |||||
| } | |||||
| #else | |||||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||||
| float * __restrict R, BLASLONG strideR){} | |||||
| #endif | |||||