Update from develop for 0.3.30 release (tags/v0.3.30)
| @@ -58,8 +58,8 @@ task: | |||
| - export VALID_ARCHS="i386 x86_64" | |||
| - xcrun --sdk macosx --show-sdk-path | |||
| - xcodebuild -version | |||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" | |||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" | |||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| @@ -78,8 +78,8 @@ task: | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - xcrun --sdk iphoneos --show-sdk-path | |||
| - ls -l /Applications | |||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | |||
| @@ -127,7 +127,7 @@ task: | |||
| FreeBSD_task: | |||
| name: FreeBSD-gcc | |||
| freebsd_instance: | |||
| image_family: freebsd-14-1 | |||
| image_family: freebsd-14-2 | |||
| install_script: | |||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
| compile_script: | |||
| @@ -138,7 +138,7 @@ FreeBSD_task: | |||
| FreeBSD_task: | |||
| name: freebsd-gcc-ilp64 | |||
| freebsd_instance: | |||
| image_family: freebsd-14-1 | |||
| image_family: freebsd-14-2 | |||
| install_script: | |||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
| compile_script: | |||
| @@ -148,7 +148,7 @@ FreeBSD_task: | |||
| FreeBSD_task: | |||
| name: FreeBSD-clang-openmp | |||
| freebsd_instance: | |||
| image_family: freebsd-14-1 | |||
| image_family: freebsd-14-2 | |||
| install_script: | |||
| - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
| - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so | |||
| @@ -102,6 +102,7 @@ jobs: | |||
| mkdir build && cd build | |||
| cmake -DDYNAMIC_ARCH=1 \ | |||
| -DUSE_OPENMP=${{matrix.openmp}} \ | |||
| -DOpenMP_Fortran_LIB_NAMES=omp \ | |||
| -DINTERFACE64=${{matrix.ilp64}} \ | |||
| -DNOFORTRAN=0 \ | |||
| -DBUILD_WITHOUT_LAPACK=0 \ | |||
| @@ -31,27 +31,28 @@ jobs: | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| - name: install build deps | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross | |||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| with: | |||
| repository: T-head-Semi/qemu | |||
| repository: XUANTIE-RV/qemu | |||
| path: qemu | |||
| ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||
| ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 | |||
| - name: build qemu | |||
| run: | | |||
| # Force use c910v qemu-user | |||
| wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| cd qemu | |||
| patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
| make -j$(nproc) | |||
| make install | |||
| @@ -82,9 +83,39 @@ jobs: | |||
| - name: test | |||
| run: | | |||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||
| qemu-riscv64 ./utest/openblas_utest | |||
| qemu-riscv64 ./utest/openblas_utest_ext | |||
| run_with_retry() { | |||
| local cmd="$1" | |||
| local time_out=10 | |||
| local retries=10 | |||
| local attempt=0 | |||
| for ((i=1; i<=retries; i++)); do | |||
| attempt=$((i)) | |||
| if timeout -s 12 --preserve-status $time_out $cmd; then | |||
| echo "Command succeeded on attempt $i." | |||
| return 0 | |||
| else | |||
| local exit_code=$? | |||
| if [ $exit_code -eq 140 ]; then | |||
| echo "Attempt $i timed out (retrying...)" | |||
| time_out=$((time_out + 5)) | |||
| else | |||
| echo "Attempt $i failed with exit code $exit_code. Aborting workflow." | |||
| exit $exit_code | |||
| fi | |||
| fi | |||
| done | |||
| echo "All $retries attempts failed, giving up." | |||
| echo "Final failure was due to timeout." | |||
| echo "Aborting workflow." | |||
| exit $exit_code | |||
| } | |||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH | |||
| which qemu-riscv64 | |||
| export QEMU_BIN=$(which qemu-riscv64) | |||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest" | |||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | |||
| @@ -15,7 +15,7 @@ jobs: | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| os: [ubuntu-latest] | |||
| os: [ubuntu-22.04] | |||
| fortran: [gfortran] | |||
| build: [make] | |||
| pyver: ["3.12"] | |||
| @@ -147,7 +147,7 @@ jobs: | |||
| OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' | |||
| - name: Run benchmarks | |||
| uses: CodSpeedHQ/action@v2 | |||
| uses: CodSpeedHQ/action@v3 | |||
| with: | |||
| token: ${{ secrets.CODSPEED_TOKEN }} | |||
| run: | | |||
| @@ -43,7 +43,9 @@ jobs: | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| sudo apt-get update | |||
| sudo apt-get install -y gfortran cmake ccache libtinfo5 | |||
| sudo apt-get install -y gfortran cmake ccache | |||
| wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||
| sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. | |||
| brew reinstall gcc | |||
| @@ -354,3 +356,23 @@ jobs: | |||
| - name: Build OpenBLAS | |||
| run: | | |||
| make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} | |||
| neoverse_build: | |||
| if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
| runs-on: ubuntu-24.04-arm | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| - name: Install Dependencies | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install -y gcc gfortran make | |||
| - name: Build OpenBLAS | |||
| run: | | |||
| make -j${nproc} | |||
| make -j${nproc} lapack-test | |||
| @@ -41,7 +41,7 @@ jobs: | |||
| - name: Install APT deps | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev | |||
| - name: Download and install loongarch64-toolchain | |||
| run: | | |||
| @@ -41,14 +41,14 @@ jobs: | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
| gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross | |||
| gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| with: | |||
| repository: qemu/qemu | |||
| path: qemu | |||
| ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 | |||
| ref: ae35f033b874c627d81d51070187fbf55f0bf1a7 | |||
| - name: build qemu | |||
| run: | | |||
| @@ -9,7 +9,7 @@ project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 29) | |||
| set(OpenBLAS_PATCH_VERSION 29.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| @@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers) | |||
| ####### | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | |||
| option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF) | |||
| option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | |||
| set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | |||
| @@ -60,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th | |||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
| option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
| option(BUILD_SHARED_LIBS "Build shared library" OFF) | |||
| if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
| set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| endif() | |||
| @@ -75,12 +78,27 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in | |||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) | |||
| set (DELETE_STATIC_LIBS "") | |||
| if (NOT BUILD_STATIC_LIBS) | |||
| message (STATUS "forcing build of a temporary static library for symbol renaming") | |||
| set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) | |||
| set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib)) | |||
| endif () | |||
| endif() | |||
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| endif() | |||
| if (BUILD_WITHOUT_LAPACKE) | |||
| set(NO_LAPACKE 1) | |||
| endif() | |||
| if(BUILD_WITHOUT_CBLAS) | |||
| set(NO_CBLAS 1) | |||
| endif() | |||
| @@ -103,14 +121,15 @@ endif() | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| if (USE_OPENMP) | |||
| find_package(OpenMP REQUIRED) | |||
| endif () | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) | |||
| if (${HAVE64} GREATER -1) | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) | |||
| else () | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| endif () | |||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
| @@ -224,6 +243,12 @@ endif () | |||
| # add objects to the openblas lib | |||
| if(NOT NO_LAPACK) | |||
| add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | |||
| if (USE_OPENMP AND (NOT NOFORTRAN)) | |||
| # Disable OpenMP for LAPACK Fortran codes on Windows. | |||
| if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| @@ -265,30 +290,59 @@ endif() | |||
| if (USE_OPENMP) | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
| if(NOFORTRAN) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
| if(NOFORTRAN) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Seems that this hack doesn't required since macOS 11 Big Sur | |||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| # Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on | |||
| if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) | |||
| # Use response files | |||
| set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| # Always build static library first | |||
| if(BUILD_STATIC_LIBS) | |||
| set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") | |||
| else() | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") | |||
| endif() | |||
| set(CREATE_STATIC_LIBRARY_COMMAND | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") | |||
| if(BUILD_SHARED_LIBS) | |||
| add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) | |||
| set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") | |||
| endif() | |||
| if(USE_OPENMP) | |||
| get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
| else() | |||
| set(OMP_LIB "") | |||
| endif() | |||
| if(NOT NOFORTRAN) | |||
| set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| else() | |||
| set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Handle MSVC exports | |||
| @@ -373,7 +427,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| @@ -461,10 +515,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| else () | |||
| set (BZ 0) | |||
| endif() | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| if (CMAKE_BUILD_TYPE MATCHES "Debug") | |||
| set (CRTLIB msvcrtd) | |||
| set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb) | |||
| set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| else () | |||
| set (CRTLIB msvcrt) | |||
| set (PDBOPT "") | |||
| endif() | |||
| #if (USE_PERL) | |||
| message(STATUS "adding postbuild instruction to rename syms") | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def | |||
| COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c | |||
| COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} | |||
| #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a | |||
| ${REMOVE_STATIC_LIB} VERBATIM | |||
| ) | |||
| #endif () | |||
| else () | |||
| if (NOT USE_PERL) | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| else() | |||
| @@ -475,6 +552,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| ) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (BUILD_BENCHMARKS) | |||
| #find_package(OpenMP REQUIRED) | |||
| @@ -645,3 +723,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -26,6 +26,9 @@ | |||
| * Chris Sidebottom <chris.sidebottom@arm.com> | |||
| * Optimizations and other improvements targeting AArch64 | |||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
| * Optimizations and other improvements targeting AArch64 | |||
| ## Previous Developers | |||
| * Zaheer Chothia <zaheer.chothia@gmail.com> | |||
| @@ -231,4 +234,23 @@ In chronological order: | |||
| * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | |||
| * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | |||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
| * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 | |||
| * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||
| * [2025-02-26] Add sbgemv_t_bfdot kernel | |||
| * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 | |||
| * [2025-03-12] Optimize aarch64 sgemm_ncopy | |||
| * Marek Michalowski <marek.michalowski@arm.com> | |||
| * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` | |||
| * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` | |||
| * [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2` | |||
| * Ye Tao <ye.tao@arm.com> | |||
| * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 | |||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||
| * Abhishek Kumar <https://github.com/abhishek-iitmadras> | |||
| * [2025-04-22] Optimise dot kernel for NEOVERSE V1 | |||
| @@ -1,4 +1,138 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.30 | |||
| 19-Jun-2025 | |||
| general: | |||
| - fixed an installation problem with the thread safety test in gmake builds | |||
| - fixed spurious overwriting of an input array in complex GEMMT/GEMMTR | |||
| - fixed naming of GEMMTR in error messages from XERBLA | |||
| - fixed compilation of SBGEMMT/SBGEMMTR in CMake builds | |||
| - fixed the implementation of ?NRM2 to handle INCX=0 correctly | |||
| - removed tests for CSROT and ZDROT that relied on unspecified behavior | |||
| - fixed a performance regression in multithreaded GEMM that was particularly | |||
| serious on POWER targets | |||
| - fixed linking issues when using LLVM's flang-new with gmake | |||
| - fixed a potential thread safety problem with C11 atomic operations | |||
| - further improved the workload partitioning in parallel GEMM | |||
| - fixed omission of LAPACKE interfaces for CGESVDQ, CTRSYL3 and ?GEQPF in | |||
| CMake builds | |||
| - fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies | |||
| for LAPACK function SPMV in CMake builds | |||
| - added explicit CMake options for building LAPACKE and shared libraries | |||
| - simplified and improved handling of OpenMP options in CMake builds | |||
| - reworked Windows DLL generation in CMake builds to ensure correct symbol | |||
| renaming (pre/postfixing) and optional generation of PDB files for debugging | |||
| - updated the Perl script version of the gensymbol utility for use with | |||
| Windows-on-Arm | |||
| - fixed building with (MinGW) gmake on Windows to ensure completeness of the | |||
| LAPACK included in the static library (potential race condition due to the | |||
| Windows version of the "ln" utility creating snapshot copies rather than links) | |||
| - fixed unwanted deletion of the lapacke_mangling.h file by "make clean" | |||
| - fixed potential duplication of a _64 suffix on library names in CMake builds | |||
| - fixed compilation of the C fallback copies of the LAPACK code with GCC 15 | |||
| - included fixes from the Reference-LAPACK project: | |||
| - fixed a truncated error message in the EIG part of the testsuite | |||
| (Reference-LAPACK PR 1119) | |||
| - fixed too strict check in LAPACKE_?gesdd_work (PR #1126) | |||
| - fixed memory corruption when calling ?GEEV with non-finite data (PR #1128) | |||
| - fixed missing initialization of a variable in C/GEQP3RK (PR #1131) | |||
| - fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135) | |||
| x86_64: | |||
| - fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - improved the compiler identification code for flang-new | |||
| - fixed a potential build issue in the ZSUM kernel | |||
| - fixed "argument list too long" errors when building on MacOS | |||
| - added cpu autodetection support for several new Arrow Lake models | |||
| - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH | |||
| - fixed compilation with the MinGW build of GCC 15 | |||
| arm64: | |||
| - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29) | |||
| - added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds | |||
| - added an optimized SBGEMM kernel for NEOVERSEV1 | |||
| - improved 1xN SBGEMM performance by forwarding to SBGEMV | |||
| - introduced a stepwise increase of the thread count used for | |||
| SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size | |||
| - introduced a stepwise increase of the thread count used for | |||
| DGEMV on NEOVERSEV1 in relation to problem size | |||
| - introduced a stepwise increase of the thread count used for | |||
| SDOT and DDOT on NEOVERSEV1 in relation to problem size | |||
| - worked around assembler limitations in LLVM for Windows-on-Arm | |||
| - enabled cpu type autodetection from the registry on Windows-on-Arm | |||
| - improved multithreading threshold for GEMV and GESV on Windows-on-Arm | |||
| - fixed overoptimization issues with LLVM's flang in Windows-on-Arm | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - added a fast path SGEMM kernel for small workloads on SME capable targets | |||
| - improved performance of SGEMM and DGEMM kernels for small workloads | |||
| - improved performance of SGEMV and DGEMV on SVE-capable targets | |||
| - improved performance of SGEMV on NEOVERSEN1 and Apple M | |||
| - added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all | |||
| SVE capable targets | |||
| - added optimized SBGEMV kernels for NEOVERSEV1/V2/N2 | |||
| - improved performance of SGEMM through faster NCOPY kernels | |||
| - added compiler options for the NVIDIA HPC Compiler Suite | |||
| - fixed compilation on OSX with Xcode 16.3 and later | |||
| - fixed cpu core type and cache size detection on Apple M4 | |||
| - updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake | |||
| - fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds | |||
| - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH | |||
| - fixed potential miscompilation of the non-SVE SDOT kernel | |||
| riscv64: | |||
| - added optimized SROTM and DROTM kernels for x280 | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - improved performance of GEMM_TCOPY on RVV1.0 targets with | |||
| VLEN of 128 or 256 | |||
| - improved performance of OMATCOPY on targets with VLEN 256 | |||
| - greatly improved performance of SGEMV/DGEMV | |||
| - improved performance of CGEMV and ZGEMV on C910V and all RVV targets | |||
| with VLEN 256 | |||
| - improved performance of SAXPBY and DAXPBY on C910V and all RVV targets | |||
| with VLEN 256 | |||
| - improved performance of AXPY and DOT on C910V and ZVL256B targets by | |||
| falling back to non-vectorized code for very small N. (Thereby fixing | |||
| poor performance of CHBMV/ZHBMV for very small K) | |||
| - fixed CMake build failures of the TRMM kernels | |||
| loongarch64: | |||
| - improved performance of the LSX versions of SSYMV/DSYMV | |||
| - made the LASX versions of the DSYMV and SSYMV kernels | |||
| compatible with hardware changes in LA664 and future targets | |||
| - fixed inaccuracies in several LASX kernels | |||
| - improved compatibility of LSX kernels with LA264 targets | |||
| - fixed handling of deprecated target names in CMake builds | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| power: | |||
| - fixed building for PPCG4 with CMake | |||
| - fixed SSCAL/DSCAL on PPC970 running FreeBSD | |||
| - fixed a potential alignment issue in the POWER8 SGEMV kernel | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| zarch: | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - fixed unwanted generation of object files with a writable stack | |||
| x86: | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - worked around potential miscompilation of CDOT with very old binutils | |||
| arm: | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| - fixed unwanted generation of object files with a writable stack | |||
| sparc: | |||
| - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL | |||
| alpha: | |||
| - fixed build failure caused by spurious Windows-only typecasts | |||
| cell: | |||
| - fixed probable build issue caused by spurious Windows-only typecasts | |||
| ==================================================================== | |||
| Version 0.3.29 | |||
| 12-Jan-2025 | |||
| @@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
| fi | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @echo -n " Library Name ... $(LIBNAME)" | |||
| else | |||
| @@ -447,7 +452,7 @@ endif | |||
| @rm -f cblas.tmp cblas.tmp2 | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C relapack clean | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV9SME) | |||
| CCOMMON_OPT += -march=armv9-a+sve2+sme | |||
| FCOMMON_OPT += -march=armv9-a+sve2 | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -101,7 +106,7 @@ ifeq ($(CORE), NEOVERSEV1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
| ifeq (1, $(ISCLANG)) | |||
| CCOMMON_OPT += -mtune=cortex-x1 | |||
| else | |||
| @@ -111,7 +116,7 @@ ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a+sve | |||
| CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
| ifneq ($(CROSS), 1) | |||
| CCOMMON_OPT += -mtune=native | |||
| endif | |||
| @@ -315,8 +315,8 @@ endif | |||
| endif | |||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| endif | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.29 | |||
| VERSION = 0.3.29.dev | |||
| # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | |||
| # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | |||
| @@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| GEMM_GEMV_FORWARD_BF16 = 1 | |||
| endif | |||
| ifeq ($(ARCH), riscv) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| @@ -420,6 +421,7 @@ ifeq ($(ARCH), arm64) | |||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| export NO_SVE = 1 | |||
| export NO_SME = 1 | |||
| endif | |||
| else | |||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
| @@ -434,6 +436,11 @@ ifeq (x$(XCVER), x 15) | |||
| CCOMMON_OPT += -Wl,-ld_classic | |||
| FCOMMON_OPT += -Wl,-ld_classic | |||
| endif | |||
| ifeq (x$(XCVER), x 16) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) | |||
| endif | |||
| endif | |||
| endif | |||
| ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | |||
| @@ -709,6 +716,9 @@ DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| DYNAMIC_CORE += A64FX | |||
| endif | |||
| ifneq ($(NO_SME), 1) | |||
| DYNAMIC_CORE += ARMV9SME | |||
| endif | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| @@ -1472,6 +1482,10 @@ ifeq ($(NO_SVE), 1) | |||
| CCOMMON_OPT += -DNO_SVE | |||
| endif | |||
| ifeq ($(NO_SME), 1) | |||
| CCOMMON_OPT += -DNO_SME | |||
| endif | |||
| ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| @@ -111,6 +111,7 @@ THUNDERX3T110 | |||
| VORTEX | |||
| A64FX | |||
| ARMV8SVE | |||
| ARMV9SME | |||
| FT2000 | |||
| 9.System Z: | |||
| @@ -25,14 +25,28 @@ jobs: | |||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | |||
| COPY . /tmp/openblas | |||
| RUN cd /tmp/openblas && \ | |||
| COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ | |||
| BTYPE='BINARY=64' CC=gcc && \ | |||
| make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ | |||
| make -C test $COMMON_FLAGS $BTYPE && \ | |||
| make -C ctest $COMMON_FLAGS $BTYPE && \ | |||
| make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile | |||
| CC=gcc && \ | |||
| make QUIET_MAKE=1 BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C test BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C ctest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C utest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile | |||
| docker build . | |||
| displayName: Run manylinux1 docker build | |||
| - job: manylinux_32bit | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| echo "FROM quay.io/pypa/manylinux2014_i686 | |||
| COPY . /tmp/openblas | |||
| RUN cd /tmp/openblas && \ | |||
| CC=gcc && \ | |||
| make QUIET_MAKE=1 BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C test BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C ctest BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ | |||
| make -C utest BINARY=32 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile | |||
| docker build . | |||
| displayName: Run manylinux 32bit docker build | |||
| - job: Intel_SDE_skx | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| @@ -141,7 +155,7 @@ jobs: | |||
| - job: OSX_OpenMP | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| @@ -151,7 +165,7 @@ jobs: | |||
| - job: OSX_GCC_Nothreads | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| @@ -164,7 +178,19 @@ jobs: | |||
| - script: | | |||
| brew update | |||
| make CC=gcc-12 FC=gfortran-12 | |||
| - job: OSX_LLVM_flangnew | |||
| pool: | |||
| vmImage: 'macOS-latest' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm flang | |||
| make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 | |||
| - job: OSX_OpenMP_Clang | |||
| pool: | |||
| vmImage: 'macOS-latest' | |||
| @@ -195,7 +221,7 @@ jobs: | |||
| - job: OSX_dynarch_cmake | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| @@ -242,7 +268,7 @@ jobs: | |||
| - job: OSX_NDK_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| @@ -252,7 +278,7 @@ jobs: | |||
| - job: OSX_IOS_ARMV8 | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| variables: | |||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 | |||
| @@ -262,7 +288,7 @@ jobs: | |||
| - job: OSX_IOS_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| variables: | |||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 | |||
| @@ -272,7 +298,7 @@ jobs: | |||
| - job: OSX_xbuild_DYNAMIC_ARM64 | |||
| pool: | |||
| vmImage: 'macOS-12' | |||
| vmImage: 'macOS-13' | |||
| variables: | |||
| CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 | |||
| @@ -334,6 +334,24 @@ if [ "$architecture" = "arm64" ]; then | |||
| rm -rf "$tmpd" | |||
| fi | |||
| no_sme=0 | |||
| if [ "$architecture" = "arm64" ]; then | |||
| tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
| tmpf="$tmpd/a.S" | |||
| printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" | |||
| args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf" | |||
| no_sme=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf" | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_sme=1 | |||
| } | |||
| rm -rf "$tmpd" | |||
| fi | |||
| c11_atomics=0 | |||
| case "$data" in | |||
| *HAVE_C11*) | |||
| @@ -475,6 +493,7 @@ done | |||
| printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" | |||
| [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" | |||
| [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" | |||
| [ "$no_sme" -eq 1 ] && printf "NO_SME=1\n" | |||
| [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | |||
| @@ -31,22 +31,23 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # NO_AFFINITY = 1 | |||
| find_package(OpenMP REQUIRED) | |||
| if (OpenMP_FOUND) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") | |||
| endif() | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
| endif() | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -84,7 +84,7 @@ endif () | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") | |||
| if (POWER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") | |||
| else () | |||
| elseif (X86_64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") | |||
| endif () | |||
| endif () | |||
| @@ -182,7 +182,9 @@ endif () | |||
| if (${CORE} STREQUAL A64FX) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") | |||
| elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| @@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") | |||
| else () | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| @@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") | |||
| else () | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| @@ -220,10 +226,12 @@ endif () | |||
| if (${CORE} STREQUAL NEOVERSEN1) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") | |||
| elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| @@ -232,21 +240,33 @@ if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL ARMV9SME) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL CORTEXA510) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL CORTEXA710) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| @@ -258,7 +278,7 @@ endif () | |||
| if (${CORE} STREQUAL CORTEXX2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| @@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L | |||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | |||
| endif () | |||
| @@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "FUJITSU") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q32") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") | |||
| if (USE_OPENMP) | |||
| set(FEXTRALIB "${FEXTRALIB} -lstdc++") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") | |||
| set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "COMPAQ") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||
| if (NOT USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") | |||
| # -w=unused: Suppress warning messages about unused variables | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||
| SetFallback(CROTKERNEL zrot.S) | |||
| SetFallback(ZROTKERNEL zrot.S) | |||
| SetFallback(XROTKERNEL zrot.S) | |||
| SetFallback(SROTMKERNEL rotm.S) | |||
| SetFallback(DROTMKERNEL rotm.S) | |||
| SetFallback(QROTMKERNEL rotm.S) | |||
| SetFallback(SSCALKERNEL scal.S) | |||
| SetFallback(DSCALKERNEL scal.S) | |||
| SetFallback(CSCALKERNEL zscal.S) | |||
| @@ -98,6 +98,8 @@ set(CSRC | |||
| lapacke_cgesv_work.c | |||
| lapacke_cgesvd.c | |||
| lapacke_cgesvd_work.c | |||
| lapacke_cgesvdq.c | |||
| lapacke_cgesvdq_work.c | |||
| lapacke_cgesvdx.c | |||
| lapacke_cgesvdx_work.c | |||
| lapacke_cgesvj.c | |||
| @@ -1766,8 +1768,8 @@ set(SSRC | |||
| lapacke_strsna_work.c | |||
| lapacke_strsyl.c | |||
| lapacke_strsyl_work.c | |||
| lapacke_ctrsyl3.c | |||
| lapacke_ctrsyl3_work.c | |||
| lapacke_strsyl3.c | |||
| lapacke_strsyl3_work.c | |||
| lapacke_strtri.c | |||
| lapacke_strtri_work.c | |||
| lapacke_strtrs.c | |||
| @@ -2410,10 +2412,10 @@ set(ZSRC | |||
| lapacke_ilaver.c | |||
| ) | |||
| if (BUILD_LAPACK_DEPRECATED) | |||
| set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||
| set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||
| set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||
| set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||
| list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||
| list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||
| list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||
| list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||
| endif() | |||
| set(SRCX | |||
| @@ -1006,15 +1006,15 @@ endif () | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -1249,6 +1249,25 @@ endif () | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define L2_ASSOCIATIVE\t32\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| @@ -1409,9 +1428,11 @@ endif () | |||
| # GetArch_2nd | |||
| foreach(float_char S;D;Q;C;Z;X) | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | |||
| message(STATUS "setting unrollm=2") | |||
| set(${float_char}GEMM_UNROLL_M 2) | |||
| endif() | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | |||
| message(STATUS "setting unrolln=2") | |||
| set(${float_char}GEMM_UNROLL_N 2) | |||
| endif() | |||
| endforeach() | |||
| @@ -21,7 +21,15 @@ endif() | |||
| # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | |||
| # It seems we are meant to use TARGET as input and CORE internally as kernel. | |||
| if(NOT DEFINED CORE AND DEFINED TARGET) | |||
| set(CORE ${TARGET}) | |||
| if (${TARGET} STREQUAL "LOONGSON3R5") | |||
| set(CORE "LA464") | |||
| elseif (${TARGET} STREQUAL "LOONGSON2K1000") | |||
| set(CORE "LA264") | |||
| elseif (${TARGET} STREQUAL "LOONGSONGENERIC") | |||
| set(CORE "LA64_GENERIC") | |||
| else () | |||
| set(CORE ${TARGET}) | |||
| endif() | |||
| endif() | |||
| # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
| @@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL ARMV9SME) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") | |||
| endif() | |||
| if (${TARGET} STREQUAL A64FX) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | |||
| @@ -361,6 +372,20 @@ else () | |||
| endif () | |||
| endif () | |||
| if (USE_OPENMP) | |||
| find_package(OpenMP COMPONENTS C REQUIRED) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") | |||
| if (NOT NOFORTRAN) | |||
| find_package(OpenMP COMPONENTS Fortran REQUIRED) | |||
| # Avoid mixed OpenMP linkage | |||
| get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
| get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) | |||
| if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) | |||
| message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| if (BINARY64) | |||
| if (INTERFACE64) | |||
| # CCOMMON_OPT += -DUSE64BITINT | |||
| @@ -620,6 +645,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
| endif() | |||
| # TODO: not sure what PFLAGS is -hpa | |||
| set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") | |||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
| if ("${F_COMPILER}" STREQUAL "FLANG") | |||
| if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
| endif () | |||
| endif () | |||
| if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") | |||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") | |||
| endif () | |||
| endif () | |||
| set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | |||
| # TODO: not sure what FPFLAGS is -hpa | |||
| @@ -632,20 +669,11 @@ if (LAPACK_STRLEN) | |||
| endif() | |||
| set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | |||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | |||
| if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") | |||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||
| endforeach () | |||
| endif () | |||
| if (CMAKE_Fortran_COMPILER) | |||
| if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2;-mskylake-avx512") | |||
| if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| message(STATUS "removing fortran flags") | |||
| message(STATUS "removing fortran flags not supported by the compiler") | |||
| set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | |||
| endif () | |||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
| @@ -676,13 +704,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL | |||
| set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") | |||
| endif () | |||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
| if ("${F_COMPILER}" STREQUAL "FLANG") | |||
| if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (NOT DEFINED SUFFIX) | |||
| set(SUFFIX o) | |||
| @@ -139,6 +139,17 @@ endif() | |||
| endif() | |||
| endif() | |||
| if (ARM64) | |||
| if (NOT NO_SME) | |||
| file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) | |||
| if (NO_SME EQUAL 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") | |||
| endif() | |||
| file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o") | |||
| endif() | |||
| endif() | |||
| include(CheckIncludeFile) | |||
| CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) | |||
| if (HAVE_C11 EQUAL 1) | |||
| @@ -16,6 +16,14 @@ endfunction () | |||
| macro(ParseMakefileVars MAKEFILE_IN) | |||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
| set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
| set (OSNAME ${CMAKE_SYSTEM_NAME}) | |||
| if (${C_COMPILER} MATCHES Clang) | |||
| set (C_COMPILER CLANG) | |||
| endif () | |||
| if (${OSNAME} STREQUAL Windows) | |||
| set (OSNAME WINNT) | |||
| endif () | |||
| message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| set (SkipIfs 0) | |||
| @@ -702,6 +702,7 @@ void gotoblas_profile_init(void); | |||
| void gotoblas_profile_quit(void); | |||
| int support_avx512(void); | |||
| int support_sme1(void); | |||
| #ifdef USE_OPENMP | |||
| @@ -114,7 +114,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| OPENBLAS_ARM_TYPE_FUNCTION \ | |||
| REALNAME: | |||
| #define EPILOGUE | |||
| #if defined(__ELF__) && defined(__linux__) | |||
| # define GNUSTACK .section .note.GNU-stack,"",%progbits | |||
| #else | |||
| # define GNUSTACK | |||
| #endif | |||
| #define EPILOGUE \ | |||
| GNUSTACK | |||
| #define PROFCODE | |||
| @@ -175,7 +175,7 @@ REALNAME: | |||
| #define HUGE_PAGESIZE ( 4 << 20) | |||
| #ifndef BUFFERSIZE | |||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) | |||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) | |||
| #define BUFFER_SIZE (32 << 22) | |||
| #else | |||
| #define BUFFER_SIZE (32 << 20) | |||
| @@ -22,6 +22,7 @@ | |||
| #define DSUM_K dsum_k | |||
| #define DSWAP_K dswap_k | |||
| #define DROT_K drot_k | |||
| #define DROTM_K drotm_k | |||
| #define DGEMV_N dgemv_n | |||
| #define DGEMV_T dgemv_t | |||
| @@ -180,6 +181,7 @@ | |||
| #define DSUM_K gotoblas -> dsum_k | |||
| #define DSWAP_K gotoblas -> dswap_k | |||
| #define DROT_K gotoblas -> drot_k | |||
| #define DROTM_K gotoblas -> drotm_k | |||
| #define DGEMV_N gotoblas -> dgemv_n | |||
| #define DGEMV_T gotoblas -> dgemv_t | |||
| @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||
| int drotmg_k(double *, double *, double *, double *, double *); | |||
| int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | |||
| int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||
| int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||
| int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||
| int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
| int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
| @@ -70,6 +70,7 @@ | |||
| #define SUM_K QSUM_K | |||
| #define SWAP_K QSWAP_K | |||
| #define ROT_K QROT_K | |||
| #define ROTM_K QROTM_K | |||
| #define GEMV_N QGEMV_N | |||
| #define GEMV_T QGEMV_T | |||
| @@ -361,6 +362,7 @@ | |||
| #define SUM_K DSUM_K | |||
| #define SWAP_K DSWAP_K | |||
| #define ROT_K DROT_K | |||
| #define ROTM_K DROTM_K | |||
| #define GEMV_N DGEMV_N | |||
| #define GEMV_T DGEMV_T | |||
| @@ -977,6 +979,7 @@ | |||
| #define SUM_K SSUM_K | |||
| #define SWAP_K SSWAP_K | |||
| #define ROT_K SROT_K | |||
| #define ROTM_K SROTM_K | |||
| #define GEMV_N SGEMV_N | |||
| #define GEMV_T SGEMV_T | |||
| @@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||
| double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -197,6 +198,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | |||
| int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -221,6 +223,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||
| #endif | |||
| #ifdef ARCH_ARM64 | |||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
| #endif | |||
| int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -330,6 +336,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | |||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
| int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -439,6 +446,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
| int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
| int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| @@ -22,6 +22,7 @@ | |||
| #define QSUM_K qsum_k | |||
| #define QSWAP_K qswap_k | |||
| #define QROT_K qrot_k | |||
| #define QROTM_K qrotm_k | |||
| #define QGEMV_N qgemv_n | |||
| #define QGEMV_T qgemv_t | |||
| @@ -165,6 +166,7 @@ | |||
| #define QSUM_K gotoblas -> qsum_k | |||
| #define QSWAP_K gotoblas -> qswap_k | |||
| #define QROT_K gotoblas -> qrot_k | |||
| #define QROTM_K gotoblas -> qrotm_k | |||
| #define QGEMV_N gotoblas -> qgemv_n | |||
| #define QGEMV_T gotoblas -> qgemv_t | |||
| @@ -24,6 +24,7 @@ | |||
| #define SSCAL_K sscal_k | |||
| #define SSWAP_K sswap_k | |||
| #define SROT_K srot_k | |||
| #define SROTM_K srotm_k | |||
| #define SGEMV_N sgemv_n | |||
| #define SGEMV_T sgemv_t | |||
| @@ -189,6 +190,7 @@ | |||
| #define SSCAL_K gotoblas -> sscal_k | |||
| #define SSWAP_K gotoblas -> sswap_k | |||
| #define SROT_K gotoblas -> srot_k | |||
| #define SROTM_K gotoblas -> srotm_k | |||
| #define SGEMV_N gotoblas -> sgemv_n | |||
| #define SGEMV_T gotoblas -> sgemv_t | |||
| @@ -213,9 +215,9 @@ | |||
| #ifdef ARCH_X86_64 | |||
| #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | |||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | |||
| #else | |||
| #elif ARCH_ARM64 | |||
| #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||
| #define SGEMM_DIRECT sgemm_direct | |||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | |||
| #endif | |||
| #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | |||
| @@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| .global REALNAME ;\ | |||
| .type REALNAME, %function ;\ | |||
| REALNAME: | |||
| #define EPILOGUE | |||
| #if defined(__ELF__) && defined(__linux__) | |||
| # define GNUSTACK .section .note.GNU-stack,"",@progbits | |||
| #else | |||
| # define GNUSTACK | |||
| #endif | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| GNUSTACK | |||
| #define PROFCODE | |||
| @@ -65,3 +65,6 @@ _cpuid: | |||
| .subsections_via_symbols | |||
| #endif | |||
| #if defined(__ELF__) && defined(__linux__) | |||
| .section .note.GNU-stack,"",@progbits | |||
| #endif | |||
| @@ -43,6 +43,9 @@ size_t length64=sizeof(value64); | |||
| #ifndef HWCAP_SVE | |||
| #define HWCAP_SVE (1 << 22) | |||
| #endif | |||
| #if (defined OS_WINDOWS) | |||
| #include <winreg.h> | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
| @@ -273,11 +276,11 @@ int detect(void) | |||
| fclose(infile); | |||
| } | |||
| } | |||
| sprintf(cpuimpl,"0x%2x",implementer); | |||
| sprintf(cpuimpl,"0x%02x",implementer); | |||
| cpu_implementer=strdup(cpuimpl); | |||
| } | |||
| qsort(cpucores,1024,sizeof(int),cpusort); | |||
| sprintf(cpupart,"0x%3x",cpucores[0]); | |||
| sprintf(cpupart,"0x%03x",cpucores[0]); | |||
| cpu_part=strdup(cpupart); | |||
| if(cpu_part != NULL && cpu_implementer != NULL) { | |||
| // Arm | |||
| @@ -371,20 +374,47 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | |||
| if (value64 > 1) { | |||
| sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); | |||
| cpuhiperf=value64; | |||
| sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| } | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
| if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | |||
| if (value64 == 1867590060) return CPU_VORTEX; //M4 | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| HKEY reghandle; | |||
| HKEY hklm = HKEY_LOCAL_MACHINE; | |||
| WCHAR valstring[512]; | |||
| PVOID pvalstring=valstring; | |||
| DWORD size=sizeof (valstring); | |||
| DWORD type=RRF_RT_ANY; | |||
| DWORD flags=0; | |||
| LPCWSTR subkey= L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"; | |||
| LPCWSTR field=L"ProcessorNameString"; | |||
| LONG errcode=RegOpenKeyEx(HKEY_LOCAL_MACHINE,TEXT("Hardware\\Description\\System\\CentralProcessor\\0"), 0, KEY_READ, ®handle); | |||
| if (errcode != NO_ERROR) wprintf(L"Could not open registry key for proc0: %x\n",errcode); | |||
| errcode=RegQueryValueEx(reghandle, "ProcessorNameString", NULL,NULL ,pvalstring,&size); | |||
| if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); | |||
| //wprintf(stderr,L"%s\n",(PWSTR)valstring); | |||
| RegCloseKey(reghandle); | |||
| if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; | |||
| if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; | |||
| if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; | |||
| if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; | |||
| #endif | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -442,6 +472,7 @@ int n=0; | |||
| printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| /* Reset the size of the buffer that is actually passed below (value/length, not the 64-bit pair). */ | |||
| length = sizeof(value); | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| if (cpulowperf >0) | |||
| @@ -673,12 +704,17 @@ void get_cpuconfig(void) | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| #endif | |||
| @@ -1578,6 +1578,7 @@ int get_cpuname(void){ | |||
| case 12: //family 6 exmodel 12 | |||
| switch (model) { | |||
| case 15: | |||
| case 6: // Arrow Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SAPPHIRERAPIDS; | |||
| if(support_avx2()) | |||
| @@ -2421,6 +2422,22 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 12: | |||
| switch (model) { | |||
| case 6: // Arrow Lake | |||
| if(support_amx_bf16()) | |||
| return CORE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| @@ -6,7 +6,7 @@ enable_language(Fortran) | |||
| endif() | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
| if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) | |||
| if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1) | |||
| list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) | |||
| set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) | |||
| endif() | |||
| @@ -44,10 +44,6 @@ else() | |||
| c_${float_char}blas1.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat1 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat1 m) | |||
| endif() | |||
| @@ -73,10 +69,6 @@ else() | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat2 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat2 m) | |||
| endif() | |||
| @@ -124,20 +116,12 @@ else() | |||
| endif() | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3 m) | |||
| endif() | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3_3m m) | |||
| endif() | |||
| @@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| EXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| CEXTRALIB = -lgomp | |||
| EXTRALIB = -lgomp | |||
| endif | |||
| ifeq ($(F_COMPILER), IBM) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CEXTRALIB += -lgomp | |||
| EXTRALIB += -lgomp | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| EXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -440,7 +440,7 @@ static real c_b43 = (float)1.; | |||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
| static complex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
| extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); | |||
| static complex cx[8]; | |||
| extern real scnrm2test_(integer*, complex*, integer*); | |||
| static integer np1; | |||
| @@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a | |||
| 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | |||
| by invoking Ninja: | |||
| ```cmd | |||
| cd OpenBLAS | |||
| mkdir build | |||
| cd build | |||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new | |||
| ninja -j16 | |||
| ``` | |||
| @@ -223,3 +223,7 @@ if (USE_THREAD) | |||
| endif () | |||
| add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_level2 OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -171,3 +171,7 @@ endforeach () | |||
| # | |||
| add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_level3 OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| #ifdef USE_OPENMP | |||
| static omp_lock_t level3_lock, critical_section_lock; | |||
| static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, | |||
| static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, | |||
| parallel_section_left = MAX_PARALLEL_NUMBER; | |||
| // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | |||
| @@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| BLASLONG nthreads = args -> nthreads; | |||
| BLASLONG width, i, j, k, js; | |||
| BLASLONG width, width_n, i, j, k, js; | |||
| BLASLONG m, n, n_from, n_to; | |||
| int mode; | |||
| #if defined(DYNAMIC_ARCH) | |||
| @@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| /* Partition (a step of) n into nthreads regions */ | |||
| range_N[0] = js; | |||
| num_parts = 0; | |||
| while (n > 0){ | |||
| width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | |||
| if (width < switch_ratio && width > 1) { | |||
| width = switch_ratio; | |||
| for(j = 0; j < nthreads_n; j++){ | |||
| width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); | |||
| n -= width_n; | |||
| for(i = 0; i < nthreads_m; i++){ | |||
| width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); | |||
| if (width < switch_ratio) { | |||
| width = switch_ratio; | |||
| } | |||
| width = round_up(width_n, width, GEMM_PREFERED_SIZE); | |||
| width_n -= width; | |||
| if (width_n < 0) { | |||
| width = width + width_n; | |||
| width_n = 0; | |||
| } | |||
| range_N[num_parts + 1] = range_N[num_parts] + width; | |||
| num_parts ++; | |||
| } | |||
| width = round_up(n, width, GEMM_PREFERED_SIZE); | |||
| n -= width; | |||
| if (n < 0) width = width + n; | |||
| range_N[num_parts + 1] = range_N[num_parts] + width; | |||
| num_parts ++; | |||
| } | |||
| for (j = num_parts; j < MAX_CPU_NUMBER; j++) { | |||
| range_N[j + 1] = range_N[num_parts]; | |||
| @@ -844,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||
| /* Objective function come from sum of partitions in m and n. */ | |||
| /* (n / nthreads_n) + (m / nthreads_m) */ | |||
| /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ | |||
| while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { | |||
| nthreads_m /= 2; | |||
| nthreads_n *= 2; | |||
| BLASLONG cost = 0, div = 0; | |||
| BLASLONG i; | |||
| for (i = 1; i <= sqrt(nthreads_m); i++) { | |||
| if (nthreads_m % i) continue; | |||
| BLASLONG j = nthreads_m / i; | |||
| BLASLONG cost_i = n * j + m * nthreads_n * i; | |||
| BLASLONG cost_j = n * i + m * nthreads_n * j; | |||
| if (cost == 0 || | |||
| cost_i < cost) {cost = cost_i; div = i;} | |||
| if (cost_j < cost) {cost = cost_j; div = j;} | |||
| } | |||
| if (div > 1) { | |||
| nthreads_m /= div; | |||
| nthreads_n *= div; | |||
| } | |||
| } | |||
| @@ -88,3 +88,7 @@ endif () | |||
| #endif | |||
| add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_others OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -146,8 +146,8 @@ typedef struct { | |||
| } thread_status_t; | |||
| #ifdef HAVE_C11 | |||
| #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) | |||
| #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) | |||
| #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) | |||
| #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE) | |||
| #else | |||
| #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) | |||
| #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) | |||
| @@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| #ifdef SMP_SERVER | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| LOCK_COMMAND(&server_lock); | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| UNLOCK_COMMAND(&server_lock); | |||
| #endif | |||
| BLASLONG i = 0; | |||
| blas_queue_t *current = queue; | |||
| @@ -43,6 +43,14 @@ | |||
| #include <sys/auxv.h> | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| #include <sys/sysctl.h> | |||
| int32_t value; | |||
| size_t length=sizeof(value); | |||
| int64_t value64; | |||
| size_t length64=sizeof(value64); | |||
| #endif | |||
| extern gotoblas_t gotoblas_ARMV8; | |||
| #ifdef DYNAMIC_LIST | |||
| #ifdef DYN_CORTEXA53 | |||
| @@ -115,7 +123,12 @@ extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| #ifdef DYN_ARMV9SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
| @@ -142,21 +155,28 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| extern gotoblas_t gotoblas_A64FX; | |||
| #ifndef NO_SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||
| #endif | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #define gotoblas_A64FX gotoblas_ARMV8 | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #endif | |||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 | |||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2 | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 18 | |||
| #define NUM_CORETYPES 19 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -168,6 +188,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #ifndef HWCAP_SVE | |||
| #define HWCAP_SVE (1 << 22) | |||
| #endif | |||
| #ifndef HWCAP2_SME | |||
| /* Fallback definition for when the system headers do not provide it. */ | |||
| /* Parenthesized, like HWCAP_SVE above, so the macro is safe inside larger expressions. */ | |||
| #define HWCAP2_SME (1 << 23) | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
| @@ -192,6 +215,7 @@ static char *corename[] = { | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "a64fx", | |||
| "armv9sme", | |||
| "unknown" | |||
| }; | |||
| @@ -214,6 +238,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | |||
| if (gotoblas == &gotoblas_A64FX) return corename[17]; | |||
| if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -251,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 15: return (&gotoblas_CORTEXA55); | |||
| case 16: return (&gotoblas_ARMV8SVE); | |||
| case 17: return (&gotoblas_A64FX); | |||
| case 18: return (&gotoblas_ARMV9SME); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -262,6 +288,11 @@ static gotoblas_t *get_coretype(void) { | |||
| char coremsg[128]; | |||
| #if defined (OS_DARWIN) | |||
| //future #if !defined(NO_SME) | |||
| // if (support_sme1()) { | |||
| // return &gotoblas_ARMV9SME; | |||
| // } | |||
| // #endif | |||
| return &gotoblas_NEOVERSEN1; | |||
| #endif | |||
| @@ -409,13 +440,21 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_TSV110; | |||
| } | |||
| break; | |||
| case 0x50: // Ampere | |||
| case 0x50: // Ampere/AppliedMicro | |||
| switch (part) | |||
| { | |||
| case 0x000: // Skylark/EMAG8180 | |||
| return &gotoblas_EMAG8180; | |||
| } | |||
| break; | |||
| case 0xc0: // Ampere | |||
| switch(part) | |||
| { | |||
| case 0xac3: | |||
| case 0xac4: | |||
| return &gotoblas_NEOVERSEN1; | |||
| } | |||
| break; | |||
| case 0x51: // Qualcomm | |||
| switch (part) | |||
| { | |||
| @@ -424,12 +463,20 @@ static gotoblas_t *get_coretype(void) { | |||
| } | |||
| break; | |||
| case 0x61: // Apple | |||
| //future if (support_sme1()) return &gotoblas_ARMV9SME; | |||
| return &gotoblas_NEOVERSEN1; | |||
| break; | |||
| default: | |||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #if !defined(NO_SME) | |||
| if (support_sme1()) { | |||
| return &gotoblas_ARMV9SME; | |||
| } | |||
| #endif | |||
| #ifndef NO_SVE | |||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| return &gotoblas_ARMV8SVE; | |||
| @@ -480,3 +527,19 @@ void gotoblas_dynamic_init(void) { | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| /* Report whether the running system advertises SME support. */ | |||
| /* Returns 1 when the OS-level query below reports the feature, 0 otherwise */ | |||
| /* (including on platforms for which no query is compiled in). */ | |||
| int support_sme1(void) { | |||
|   int ret = 0; | |||
| #if (defined OS_LINUX || defined OS_ANDROID) | |||
|   /* One auxiliary-vector lookup; the masked bit is normalized to 0/1. */ | |||
|   if (getauxval(AT_HWCAP2) & (HWCAP2_SME)) { | |||
|     ret = 1; | |||
|   } | |||
| #endif | |||
| #if defined(__APPLE__) | |||
|   /* Reset buffer and length before the call so a failed or short read cannot */ | |||
|   /* leave a stale value behind, and only trust the value when the call succeeds. */ | |||
|   value64 = 0; | |||
|   length64 = sizeof(value64); | |||
|   if (sysctlbyname("hw.optional.arm.FEAT_SME", &value64, &length64, NULL, 0) == 0 && value64 != 0) { | |||
|     ret = 1; | |||
|   } | |||
| #endif | |||
|   return ret; | |||
| } | |||
| @@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL) | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else ifeq ($(F_COMPILER), FLANG) | |||
| else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW)) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| @@ -21,7 +21,7 @@ | |||
| chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | |||
| chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | |||
| csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); | |||
| @blasobjsd = ( | |||
| damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | |||
| @@ -29,7 +29,7 @@ | |||
| dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | |||
| dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | |||
| dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); | |||
| @blasobjss = ( | |||
| isamax,isamin,ismax,ismin, | |||
| @@ -38,7 +38,7 @@ | |||
| smax,smin,snrm2,simatcopy,somatcopy, | |||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); | |||
| @blasobjsz = ( | |||
| izamax,izamin,, | |||
| @@ -48,28 +48,29 @@ | |||
| zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
| zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | |||
| zgeadd, dzsum, zgemmt); | |||
| zgeadd, dzsum, zgemmt,zgemmtr); | |||
| @blasobjs = (lsame, xerbla); | |||
| @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @cblasobjsc = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, | |||
| cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, | |||
| cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | |||
| cblas_scnrm2, cblas_scasum, | |||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy | |||
| cblas_cgemmt); | |||
| cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, | |||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, | |||
| cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); | |||
| @cblasobjsd = ( | |||
| cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | |||
| cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | |||
| cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, | |||
| cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, | |||
| cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | |||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, | |||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy | |||
| cblas_dgemmt); | |||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, | |||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, | |||
| cblas_damax, cblas_damin, cblas_dgemm_batch); | |||
| @cblasobjss = ( | |||
| cblas_sasum, cblas_saxpy, cblas_saxpby, | |||
| @@ -78,9 +79,10 @@ | |||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
| cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | |||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
| cblas_strsv, cblas_sgeadd, | |||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy | |||
| cblas_sgemmt); | |||
| cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, | |||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, | |||
| cblas_samax, cblas_samin, cblas_sgemm_batch); | |||
| @cblasobjsz = ( | |||
| cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | |||
| cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | |||
| @@ -88,13 +90,13 @@ | |||
| cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | |||
| cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | |||
| cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | |||
| cblas_zaxpby, cblas_zgeadd, | |||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy | |||
| cblas_zgemmt); | |||
| cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, | |||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, | |||
| cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); | |||
| @cblasobjs = ( cblas_xerbla ); | |||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| @@ -709,6 +711,7 @@ zpotri, | |||
| # functions added for lapack-3.7.0 | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| slarfy, | |||
| ssyconvf, | |||
| strevc3, | |||
| sgelqt, | |||
| sgelqt3, | |||
| @@ -832,12 +835,82 @@ zpotri, | |||
| zungtsqr_row | |||
| ); | |||
| #functions added for lapack-3.11 | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| cgedmd, | |||
| cgedmdq | |||
| ); | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dgedmd, | |||
| dgedmdq | |||
| ); | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sgedmd, | |||
| sgedmdq | |||
| ); | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgedmd, | |||
| zgedmdq | |||
| ); | |||
| #functions added post 3.11 | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| cgelst, | |||
| cgeqp3rk, | |||
| claqp2rk, | |||
| claqp3rk, | |||
| clatrs3, | |||
| crscl, | |||
| ctrsyl3 | |||
| ); | |||
| # claqz0 | |||
| # claqz1 | |||
| # claqz2 | |||
| # claqz3 | |||
| # clatrs3 | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dgelst, | |||
| dgeqp3rk, | |||
| dlaqp2rk, | |||
| dlaqp3rk, | |||
| dlarmm, | |||
| dlatrs3, | |||
| dtrsyl3 | |||
| ); | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sgelst, | |||
| sgeqp3rk, | |||
| slaqp2rk, | |||
| slaqp3rk, | |||
| slarmm, | |||
| slatrs3, | |||
| strsyl3 | |||
| ); | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgelst, | |||
| zgeqp3rk, | |||
| zlaqp2rk, | |||
| zlaqp3rk, | |||
| zlatrs3, | |||
| zrscl, | |||
| ztrsyl3 | |||
| ); | |||
| # zlaqz0 | |||
| # zlaqz1 | |||
| # zlaqz2 | |||
| # zlaqz3 | |||
| @lapack_extendedprecision_objs = ( | |||
| zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, | |||
| dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | |||
| ); | |||
| @lapack_deprecated_objsc = ( | |||
| cgelqs, cgeqrs, | |||
| cgegs, cggsvd, | |||
| cgegv, cggsvp, | |||
| cgelsx, clahrd, | |||
| @@ -845,13 +918,16 @@ zpotri, | |||
| ctzrqf, | |||
| ); | |||
| @lapack_deprecated_objsd = ( | |||
| dgelqs, dgeqrs, | |||
| dgegs, dgeqpf, | |||
| dgegv, dggsvd, | |||
| dgelsx, dggsvp, | |||
| dlahrd, | |||
| dlatzm, dtzrqf); | |||
| @lapack_deprecated_objss = ( | |||
| @lapack_deprecated_objss = ( | |||
| sgelqs, | |||
| sgeqrs, | |||
| sgelsx, | |||
| sgegs, | |||
| sgegv, | |||
| @@ -864,6 +940,8 @@ zpotri, | |||
| ); | |||
| @lapack_deprecated_objsz = ( | |||
| zgelqs, | |||
| zgeqrs, | |||
| zgegs, | |||
| zgegv, | |||
| zgelsx, | |||
| @@ -997,6 +1075,10 @@ zpotri, | |||
| LAPACKE_cgebrd_work, | |||
| LAPACKE_cgecon, | |||
| LAPACKE_cgecon_work, | |||
| LAPACKE_cgedmd, | |||
| LAPACKE_cgedmd_work, | |||
| LAPACKE_cgedmdq, | |||
| LAPACKE_cgedmdq_work, | |||
| LAPACKE_cgeequ, | |||
| LAPACKE_cgeequ_work, | |||
| LAPACKE_cgeequb, | |||
| @@ -1584,8 +1666,15 @@ zpotri, | |||
| LAPACKE_cgetsqrhrt, | |||
| LAPACKE_cgetsqrhrt_work, | |||
| LAPACKE_cungtsqr_row, | |||
| LAPACKE_cungtsqr_row_work | |||
| LAPACKE_cungtsqr_row_work, | |||
| LAPACKE_clangb, | |||
| LAPACKE_clangb_work, | |||
| LAPACKE_ctrsyl3, | |||
| LAPACKE_ctrsyl3_work, | |||
| LAPACKE_ctz_nancheck, | |||
| LAPACKE_ctz_trans, | |||
| LAPACKE_cunhr_col, | |||
| LAPACKE_cunhr_col_work | |||
| ); | |||
| @lapackeobjsd = ( | |||
| LAPACKE_dgb_nancheck, | |||
| @@ -1656,6 +1745,10 @@ zpotri, | |||
| LAPACKE_dgebrd_work, | |||
| LAPACKE_dgecon, | |||
| LAPACKE_dgecon_work, | |||
| LAPACKE_dgedmd, | |||
| LAPACKE_dgedmd_work, | |||
| LAPACKE_dgedmdq, | |||
| LAPACKE_dgedmdq_work, | |||
| LAPACKE_dgeequ, | |||
| LAPACKE_dgeequ_work, | |||
| LAPACKE_dgeequb, | |||
| @@ -2197,7 +2290,15 @@ zpotri, | |||
| LAPACKE_dgetsqrhrt, | |||
| LAPACKE_dgetsqrhrt_work, | |||
| LAPACKE_dorgtsqr_row, | |||
| LAPACKE_dorgtsqr_row_work | |||
| LAPACKE_dorgtsqr_row_work, | |||
| LAPACKE_dlangb, | |||
| LAPACKE_dlangb_work, | |||
| LAPACKE_dorhr_col, | |||
| LAPACKE_dorhr_col_work, | |||
| LAPACKE_dtrsyl3, | |||
| LAPACKE_dtrsyl3_work, | |||
| LAPACKE_dtz_nancheck, | |||
| LAPACKE_dtz_trans, | |||
| ); | |||
| @lapackeobjss = ( | |||
| @@ -2269,6 +2370,10 @@ zpotri, | |||
| LAPACKE_sgebrd_work, | |||
| LAPACKE_sgecon, | |||
| LAPACKE_sgecon_work, | |||
| LAPACKE_sgedmd, | |||
| LAPACKE_sgedmd_work, | |||
| LAPACKE_sgedmdq, | |||
| LAPACKE_sgedmdq_work, | |||
| LAPACKE_sgeequ, | |||
| LAPACKE_sgeequ_work, | |||
| LAPACKE_sgeequb, | |||
| @@ -2802,7 +2907,15 @@ zpotri, | |||
| LAPACKE_sgetsqrhrt, | |||
| LAPACKE_sgetsqrhrt_work, | |||
| LAPACKE_sorgtsqr_row, | |||
| LAPACKE_sorgtsqr_row_work | |||
| LAPACKE_sorgtsqr_row_work, | |||
| LAPACKE_slangb, | |||
| LAPACKE_slangb_work, | |||
| LAPACKE_sorhr_col, | |||
| LAPACKE_sorhr_col_work, | |||
| LAPACKE_strsyl3, | |||
| LAPACKE_strsyl3_work, | |||
| LAPACKE_stz_nancheck, | |||
| LAPACKE_stz_trans, | |||
| ); | |||
| @lapackeobjsz = ( | |||
| @@ -2878,6 +2991,10 @@ zpotri, | |||
| LAPACKE_zgebrd_work, | |||
| LAPACKE_zgecon, | |||
| LAPACKE_zgecon_work, | |||
| LAPACKE_zgedmd, | |||
| LAPACKE_zgedmd_work, | |||
| LAPACKE_zgedmdq, | |||
| LAPACKE_zgedmdq_work, | |||
| LAPACKE_zgeequ, | |||
| LAPACKE_zgeequ_work, | |||
| LAPACKE_zgeequb, | |||
| @@ -3345,7 +3462,15 @@ zpotri, | |||
| LAPACKE_zgetsqrhrt, | |||
| LAPACKE_zgetsqrhrt_work, | |||
| LAPACKE_zungtsqr_row, | |||
| LAPACKE_zungtsqr_row_work | |||
| LAPACKE_zungtsqr_row_work, | |||
| LAPACKE_zlangb, | |||
| LAPACKE_zlangb_work, | |||
| LAPACKE_zunhr_col, | |||
| LAPACKE_zunhr_col_work, | |||
| LAPACKE_ztrsyl3, | |||
| LAPACKE_ztrsyl3_work, | |||
| LAPACKE_ztz_nancheck, | |||
| LAPACKE_ztz_trans, | |||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | |||
| @@ -3551,7 +3676,7 @@ zpotri, | |||
| LAPACKE_zsytrs_aa_2stage_work, | |||
| # new functions from 3.9.0 | |||
| LAPACKE_zgesvdq, | |||
| LAPACKE_zgesvdq_work | |||
| LAPACKE_zgesvdq_work, | |||
| ); | |||
| #These function may need 2 underscores. | |||
| @@ -3573,7 +3698,7 @@ zpotri, | |||
| ssygv_2stage, | |||
| ssysv_aa_2stage, ssytrf_aa_2stage, | |||
| ssytrs_aa_2stage, | |||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, | |||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_c=( | |||
| chetf2_rook, chetrf_rook, chetri_rook, | |||
| @@ -3598,7 +3723,7 @@ zpotri, | |||
| chetrf_aa_2stage, chetrs_aa_2stage, | |||
| csysv_aa_2stage, csytrf_aa_2stage, | |||
| csytrs_aa_2stage, | |||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, | |||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_d=( | |||
| dlasyf_rook, | |||
| @@ -3615,7 +3740,7 @@ zpotri, | |||
| dsbevd_2stage, dsygv_2stage, | |||
| dsysv_aa_2stage, | |||
| dsytrf_aa_2stage, dsytrs_aa_2stage, | |||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, | |||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_z=( | |||
| zhetf2_rook, zhetrf_rook, zhetri_rook, | |||
| @@ -3639,7 +3764,7 @@ zpotri, | |||
| zhesv_aa_2stage, zhetrf_aa_2stage, | |||
| zhetrs_aa_2stage, zsysv_aa_2stage, | |||
| zsytrf_aa_2stage, zsytrs_aa_2stage, | |||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col | |||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett | |||
| ); | |||
| @@ -245,6 +245,13 @@ else | |||
| ;; | |||
| *flang*) | |||
| vendor=FLANG | |||
| data=`$compiler -v 2>&1 > /dev/null` | |||
| v="${data#*version *}" | |||
| v="${v%%*.}" | |||
| major="${v%%.*}" | |||
| if [ "$major" -ge 17 ]; then | |||
| vendor=FLANGNEW | |||
| fi | |||
| bu=_ | |||
| openmp='-fopenmp' | |||
| ;; | |||
| @@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "ARMV8SVE" | |||
| #endif | |||
| #ifdef FORCE_ARMV9SME | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV9SME" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DARMV9SME " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" | |||
| #define LIBNAME "armv9sme" | |||
| #define CORENAME "ARMV9SME" | |||
| #endif | |||
| #ifdef FORCE_ARMV8 | |||
| #define FORCE | |||
| @@ -30,17 +30,17 @@ set(BLAS2_SOURCES | |||
| gemv.c ger.c | |||
| trsv.c trmv.c | |||
| syr2.c gbmv.c | |||
| sbmv.c | |||
| sbmv.c spmv.c | |||
| spr2.c | |||
| tbsv.c tbmv.c | |||
| tpsv.c tpmv.c | |||
| ) | |||
| set(BLAS2_REAL_ONLY_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| symv.c syr.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_LAPACK_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| symv.c syr.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | |||
| @@ -109,7 +109,7 @@ endif () | |||
| GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) | |||
| # gemmtr is gemmt under the name adopted by the Reference BLAS | |||
| GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG}) | |||
| GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG}) | |||
| # max and imax are compiled 4 times | |||
| GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) | |||
| @@ -125,8 +125,8 @@ endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| @@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) | |||
| endforeach () | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| if (NOT NO_LAPACK) | |||
| set(LAPACK_SOURCES | |||
| lapack/gesv.c | |||
| ) | |||
| @@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| endif () | |||
| add_library(interface OBJECT ${OPENBLAS_SRC}) | |||
| if (USE_OPENMP) | |||
| target_link_libraries(interface OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -1304,9 +1304,9 @@ ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| endif | |||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| @@ -1328,34 +1328,34 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
| sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
| ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1,5 +1,5 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2024 The OpenBLAS Project */ | |||
| /* Copyright 2024, 2025 The OpenBLAS Project */ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| @@ -177,6 +177,74 @@ static int init_amxtile_permission() { | |||
| } | |||
| #endif | |||
| #ifdef SMP | |||
| #ifdef DYNAMIC_ARCH | |||
| extern char* gotoblas_corename(void); | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
| static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { | |||
| return | |||
| MNK < 262144L ? 1 | |||
| : MNK < 1124864L ? MIN(ncpu, 6) | |||
| : MNK < 7880599L ? MIN(ncpu, 12) | |||
| : MNK < 17173512L ? MIN(ncpu, 16) | |||
| : MNK < 33386248L ? MIN(ncpu, 20) | |||
| : MNK < 57066625L ? MIN(ncpu, 24) | |||
| : MNK < 91733851L ? MIN(ncpu, 32) | |||
| : MNK < 265847707L ? MIN(ncpu, 40) | |||
| : MNK < 458314011L ? MIN(ncpu, 48) | |||
| : MNK < 729000000L ? MIN(ncpu, 56) | |||
| : ncpu; | |||
| } | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||
| static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { | |||
| return | |||
| MNK < 125000L ? 1 | |||
| : MNK < 1092727L ? MIN(ncpu, 6) | |||
| : MNK < 2628072L ? MIN(ncpu, 8) | |||
| : MNK < 8000000L ? MIN(ncpu, 12) | |||
| : MNK < 20346417L ? MIN(ncpu, 16) | |||
| : MNK < 57066625L ? MIN(ncpu, 24) | |||
| : MNK < 91125000L ? MIN(ncpu, 28) | |||
| : MNK < 238328000L ? MIN(ncpu, 40) | |||
| : MNK < 454756609L ? MIN(ncpu, 48) | |||
| : MNK < 857375000L ? MIN(ncpu, 56) | |||
| : MNK < 1073741824L ? MIN(ncpu, 64) | |||
| : ncpu; | |||
| } | |||
| #endif | |||
| static inline int get_gemm_optimal_nthreads(double MNK) { | |||
| int ncpu = num_cpu_avail(3); | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
| } | |||
| if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||
| } | |||
| #endif | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { | |||
| return 1; | |||
| } | |||
| else { | |||
| if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { | |||
| return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||
| } | |||
| else { | |||
| return ncpu; | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(char *TRANSA, char *TRANSB, | |||
| @@ -310,7 +378,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| FLOAT *beta = (FLOAT*) vbeta; | |||
| FLOAT *a = (FLOAT*) va; | |||
| FLOAT *b = (FLOAT*) vb; | |||
| FLOAT *c = (FLOAT*) vc; | |||
| FLOAT *c = (FLOAT*) vc; | |||
| #endif | |||
| blas_arg_t args; | |||
| @@ -349,15 +417,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| PRINT_DEBUG_CNAME; | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
| #ifdef DYNAMIC_ARCH | |||
| if (support_avx512() ) | |||
| #endif | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (support_avx512() ) | |||
| #endif | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| #endif | |||
| #if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (support_sme1()) | |||
| #endif | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| #endif | |||
| #endif | |||
| #ifndef COMPLEX | |||
| @@ -604,13 +682,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| #endif | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| args.nthreads = 1; | |||
| else { | |||
| args.nthreads = num_cpu_avail(3); | |||
| if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) | |||
| args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||
| } | |||
| args.nthreads = get_gemm_optimal_nthreads(MNK); | |||
| args.common = NULL; | |||
| @@ -38,6 +38,17 @@ | |||
| #ifndef COMPLEX | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef RNAME | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMMTR" | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "DGEMMTR" | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMMTR" | |||
| #else | |||
| #define ERROR_NAME "SGEMMTR" | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMMT " | |||
| #elif defined(DOUBLE) | |||
| @@ -47,8 +58,18 @@ | |||
| #else | |||
| #define ERROR_NAME "SGEMMT " | |||
| #endif | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| #ifdef RNAME | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMMTR" | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "ZGEMMTR" | |||
| #else | |||
| #define ERROR_NAME "CGEMMTR" | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMMT " | |||
| #elif defined(DOUBLE) | |||
| @@ -57,6 +78,7 @@ | |||
| #define ERROR_NAME "CGEMMT " | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| @@ -666,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_END; | |||
| /* transform B back if necessary */ | |||
| #if defined(COMPLEX) | |||
| if (transb > 1){ | |||
| #ifndef CBLAS | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #else | |||
| if (order == CblasColMajor) | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| if (order == CblasRowMajor) | |||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #endif | |||
| } | |||
| #endif | |||
| return; | |||
| } | |||
| @@ -63,6 +63,70 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||
| }; | |||
| #endif | |||
| #ifdef SMP | |||
| #ifdef DYNAMIC_ARCH | |||
| extern char* gotoblas_corename(void); | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
| static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||
| #ifdef DOUBLE | |||
| return (MN < 8100L) ? 1 | |||
| : (MN < 12100L) ? MIN(ncpu, 2) | |||
| : (MN < 36100L) ? MIN(ncpu, 4) | |||
| : (MN < 84100L) ? MIN(ncpu, 8) | |||
| : (MN < 348100L) ? MIN(ncpu, 16) | |||
| : (MN < 435600L) ? MIN(ncpu, 24) | |||
| : (MN < 810000L) ? MIN(ncpu, 32) | |||
| : (MN < 1050625L) ? MIN(ncpu, 40) | |||
| : ncpu; | |||
| #else | |||
| return (MN < 25600L) ? 1 | |||
| : (MN < 63001L) ? MIN(ncpu, 4) | |||
| : (MN < 459684L) ? MIN(ncpu, 16) | |||
| : ncpu; | |||
| #endif | |||
| } | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||
| static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { | |||
| return | |||
| MN < 24964L ? 1 | |||
| : MN < 65536L ? MIN(ncpu, 8) | |||
| : MN < 262144L ? MIN(ncpu, 32) | |||
| : MN < 1638400L ? MIN(ncpu, 64) | |||
| : ncpu; | |||
| } | |||
| #endif | |||
| static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||
| int ncpu = num_cpu_avail(3); | |||
| #if defined(_WIN64) && defined(_M_ARM64) | |||
| if (MN > 100000000L) | |||
| return num_cpu_avail(4); | |||
| return 1; | |||
| #endif | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
| } | |||
| if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||
| return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||
| } | |||
| #endif | |||
| if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| return 1; | |||
| else | |||
| return num_cpu_avail(2); | |||
| } | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(char *TRANS, blasint *M, blasint *N, | |||
| @@ -202,13 +266,6 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (alpha == ZERO) return; | |||
| #if 0 | |||
| /* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ | |||
| if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | |||
| GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | |||
| return; | |||
| } | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -225,11 +282,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #ifdef SMP | |||
| if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| nthreads = get_gemv_optimal_nthreads(1L * m * n); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -107,21 +107,35 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, | |||
| #ifndef PPC440 | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #endif | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| #ifndef DOUBLE | |||
| if (args.m*args.n < 40000) | |||
| #if defined(_WIN64) && defined(_M_ARM64) | |||
| #ifdef COMPLEX | |||
| if (args.m * args.n <= 300) | |||
| #else | |||
| if (args.m * args.n <= 500) | |||
| #endif | |||
| args.nthreads = 1; | |||
| else if (args.m * args.n <= 1000) | |||
| args.nthreads = 4; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| #else | |||
| if (args.m*args.n < 10000) | |||
| #ifndef DOUBLE | |||
| if (args.m * args.n < 40000) | |||
| #else | |||
| if (args.m * args.n < 10000) | |||
| #endif | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| #endif | |||
| args.nthreads=1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx == 0) | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| return (sqrt((double)n)*fabs(x[0])); | |||
| #else | |||
| return (sqrt((float)n)*fabsf(x[0])); | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| { | |||
| double fr=fabs(x[0]); | |||
| double fi=fabs(x[1]); | |||
| double fmin=MIN(fr,fi); | |||
| double fmax=MAX(fr,fi); | |||
| if (fmax==0.) return(fmax); | |||
| if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
| return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
| } | |||
| #else | |||
| { | |||
| float fr=fabs(x[0]); | |||
| float fi=fabs(x[1]); | |||
| float fmin=MIN(fr,fi); | |||
| float fmax=MAX(fr,fi); | |||
| if (fmax==0.) return(fmax); | |||
| if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
| return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
| } | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| @@ -97,13 +128,44 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| if (n <= 0) return 0.; | |||
| #ifndef COMPLEX | |||
| #ifndef COMPLEX | |||
| if (n == 1) | |||
| #ifdef DOUBLE | |||
| return fabs(x[0]); | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx == 0) | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| return (sqrt((double)n)*fabs(x[0])); | |||
| #else | |||
| return (sqrt((float)n)*fabsf(x[0])); | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| { | |||
| double fr=fabs(x[0]); | |||
| double fi=fabs(x[1]); | |||
| double fmin=MIN(fr,fi); | |||
| double fmax=MAX(fr,fi); | |||
| if (fmax==0.) return(fmax); | |||
| if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
| return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
| } | |||
| #else | |||
| { | |||
| float fr=fabs(x[0]); | |||
| float fi=fabs(x[1]); | |||
| float fmin=MIN(fr,fi); | |||
| float fmax=MAX(fr,fi); | |||
| if (fmax==0.) return(fmax); | |||
| if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
| return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
| } | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| @@ -7,149 +7,21 @@ | |||
| void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | |||
| blasint n = *N; | |||
| blasint incx = *INCX; | |||
| blasint incy = *INCY; | |||
| blasint n = *N; | |||
| blasint incx = *INCX; | |||
| blasint incy = *INCY; | |||
| PRINT_DEBUG_NAME | |||
| #else | |||
| void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | |||
| #endif | |||
| blasint i__1, i__2; | |||
| PRINT_DEBUG_CNAME; | |||
| blasint i__; | |||
| FLOAT w, z__; | |||
| blasint kx, ky; | |||
| FLOAT dh11, dh12, dh22, dh21, dflag; | |||
| blasint nsteps; | |||
| #ifndef CBLAS | |||
| PRINT_DEBUG_CNAME; | |||
| #else | |||
| PRINT_DEBUG_CNAME; | |||
| #endif | |||
| --dparam; | |||
| --dy; | |||
| --dx; | |||
| dflag = dparam[1]; | |||
| if (n <= 0 || dflag == - 2.0) goto L140; | |||
| if (! (incx == incy && incx > 0)) goto L70; | |||
| nsteps = n * incx; | |||
| if (dflag < 0.) { | |||
| goto L50; | |||
| } else if (dflag == 0) { | |||
| goto L10; | |||
| } else { | |||
| goto L30; | |||
| } | |||
| L10: | |||
| dh12 = dparam[4]; | |||
| dh21 = dparam[3]; | |||
| i__1 = nsteps; | |||
| i__2 = incx; | |||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
| w = dx[i__]; | |||
| z__ = dy[i__]; | |||
| dx[i__] = w + z__ * dh12; | |||
| dy[i__] = w * dh21 + z__; | |||
| /* L20: */ | |||
| } | |||
| goto L140; | |||
| L30: | |||
| dh11 = dparam[2]; | |||
| dh22 = dparam[5]; | |||
| i__2 = nsteps; | |||
| i__1 = incx; | |||
| for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||
| w = dx[i__]; | |||
| z__ = dy[i__]; | |||
| dx[i__] = w * dh11 + z__; | |||
| dy[i__] = -w + dh22 * z__; | |||
| /* L40: */ | |||
| } | |||
| goto L140; | |||
| L50: | |||
| dh11 = dparam[2]; | |||
| dh12 = dparam[4]; | |||
| dh21 = dparam[3]; | |||
| dh22 = dparam[5]; | |||
| i__1 = nsteps; | |||
| i__2 = incx; | |||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
| w = dx[i__]; | |||
| z__ = dy[i__]; | |||
| dx[i__] = w * dh11 + z__ * dh12; | |||
| dy[i__] = w * dh21 + z__ * dh22; | |||
| /* L60: */ | |||
| } | |||
| goto L140; | |||
| L70: | |||
| kx = 1; | |||
| ky = 1; | |||
| if (incx < 0) { | |||
| kx = (1 - n) * incx + 1; | |||
| } | |||
| if (incy < 0) { | |||
| ky = (1 - n) * incy + 1; | |||
| } | |||
| ROTM_K(n, dx, incx, dy, incy, dparam); | |||
| if (dflag < 0.) { | |||
| goto L120; | |||
| } else if (dflag == 0) { | |||
| goto L80; | |||
| } else { | |||
| goto L100; | |||
| } | |||
| L80: | |||
| dh12 = dparam[4]; | |||
| dh21 = dparam[3]; | |||
| i__2 = n; | |||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||
| w = dx[kx]; | |||
| z__ = dy[ky]; | |||
| dx[kx] = w + z__ * dh12; | |||
| dy[ky] = w * dh21 + z__; | |||
| kx += incx; | |||
| ky += incy; | |||
| /* L90: */ | |||
| } | |||
| goto L140; | |||
| L100: | |||
| dh11 = dparam[2]; | |||
| dh22 = dparam[5]; | |||
| i__2 = n; | |||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||
| w = dx[kx]; | |||
| z__ = dy[ky]; | |||
| dx[kx] = w * dh11 + z__; | |||
| dy[ky] = -w + dh22 * z__; | |||
| kx += incx; | |||
| ky += incy; | |||
| /* L110: */ | |||
| } | |||
| goto L140; | |||
| L120: | |||
| dh11 = dparam[2]; | |||
| dh12 = dparam[4]; | |||
| dh21 = dparam[3]; | |||
| dh22 = dparam[5]; | |||
| i__2 = n; | |||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||
| w = dx[kx]; | |||
| z__ = dy[ky]; | |||
| dx[kx] = w * dh11 + z__ * dh12; | |||
| dy[ky] = w * dh21 + z__ * dh22; | |||
| kx += incx; | |||
| ky += incy; | |||
| /* L130: */ | |||
| } | |||
| L140: | |||
| return; | |||
| } | |||
| @@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, | |||
| #ifdef SMP | |||
| if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) | |||
| #if defined(_WIN64) && defined(_M_ARM64) | |||
| if (m*n > 25000000L) | |||
| nthreads = num_cpu_avail(4); | |||
| else | |||
| nthreads = 1; | |||
| #else | |||
| if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| #endif | |||
| if (nthreads == 1) { | |||
| #endif | |||
| #endif | |||
| (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | |||
| #ifdef SMP | |||
| } else { | |||
| (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
| @@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| if (nthreads == 1) { | |||
| #endif | |||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); | |||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| #else | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
| FLOAT *buffer; | |||
| int uplo; | |||
| blasint info; | |||
| FLOAT * ALPHA = &alpha; | |||
| FLOAT * ALPHA = (FLOAT*)valpha; | |||
| FLOAT alpha_r = ALPHA[0]; | |||
| FLOAT alpha_i = ALPHA[1]; | |||
| #ifdef SMP | |||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
| @@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") | |||
| endif () | |||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | |||
| @@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
| @@ -198,25 +201,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
| if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| set(USE_DIRECT_SGEMM false) | |||
| if (X86_64) | |||
| if (X86_64 OR ARM64) | |||
| set(USE_DIRECT_SGEMM true) | |||
| endif() | |||
| if (USE_DIRECT_SGEMM) | |||
| # if (NOT DEFINED SGEMMDIRECTKERNEL) | |||
| if (X86_64) | |||
| set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | |||
| set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | |||
| # endif() | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||
| elseif (ARM64) | |||
| set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) | |||
| set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | |||
| set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
| if (HAVE_SME) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | |||
| endif () | |||
| endif () | |||
| endif() | |||
| foreach (float_type SINGLE DOUBLE) | |||
| @@ -1105,6 +1118,7 @@ endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
| @@ -1352,6 +1366,9 @@ endif () | |||
| if (USE_GEMM3M) | |||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||
| endif() | |||
| if (USE_OPENMP) | |||
| target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) | |||
| endif() | |||
| endfunction () | |||
| @@ -24,7 +24,11 @@ ifdef NO_AVX2 | |||
| AVX2OPT= | |||
| endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), ARMV9SME) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme | |||
| endif | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | |||
| @@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||
| XROTKERNEL = zrot.S | |||
| endif | |||
| ifndef SROTMKERNEL | |||
| SROTMKERNEL = rotm.S | |||
| endif | |||
| ifndef DROTMKERNEL | |||
| DROTMKERNEL = rotm.S | |||
| endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = rotm.S | |||
| endif | |||
| ### SCAL ### | |||
| ifndef SSCALKERNEL | |||
| @@ -504,21 +516,21 @@ SBLASOBJS += \ | |||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| saxpby_k$(TSUFFIX).$(SUFFIX) | |||
| saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||
| DBLASOBJS += \ | |||
| damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||
| QBLASOBJS += \ | |||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||
| qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||
| CBLASOBJS += \ | |||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| $(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | |||
| @@ -24,6 +24,7 @@ endif | |||
| ifeq ($(ARCH), arm64) | |||
| USE_TRMM = 1 | |||
| USE_DIRECT_SGEMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), riscv64) | |||
| @@ -95,9 +96,17 @@ endif | |||
| ifdef USE_DIRECT_SGEMM | |||
| ifndef SGEMMDIRECTKERNEL | |||
| ifeq ($(ARCH), x86_64) | |||
| SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | |||
| SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| ifeq ($(TARGET_CORE), ARMV9SME) | |||
| HAVE_SME = 1 | |||
| endif | |||
| SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| @@ -128,9 +137,20 @@ SKERNELOBJS += \ | |||
| $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | |||
| ifdef USE_DIRECT_SGEMM | |||
| ifeq ($(ARCH), x86_64) | |||
| SKERNELOBJS += \ | |||
| sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| SKERNELOBJS += \ | |||
| sgemm_direct$(TSUFFIX).$(SUFFIX) | |||
| ifdef HAVE_SME | |||
| SKERNELOBJS += \ | |||
| sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | |||
| sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -809,11 +829,23 @@ else | |||
| endif | |||
| ifdef USE_DIRECT_SGEMM | |||
| ifeq ($(ARCH), x86_64) | |||
| $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifdef HAVE_SME | |||
| $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | |||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | |||
| $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | |||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| @@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | |||
| ifndef SROTMKERNEL | |||
| SROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef DROTMKERNEL | |||
| DROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ifndef SROTMKERNEL | |||
| SROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef DROTMKERNEL | |||
| DROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /************************************************************************************** | |||
| * 2013/09/14 Saar | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| // The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. | |||
| // In certain cases, the expected results for cblas_c/zscal differ from those of other upper-level interfaces. | |||
| // To handle this, we use the dummy2 parameter to differentiate between them. | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| BLASLONG i = 0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if ((n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| inc_x2 = 2 * inc_x; | |||
| if (dummy2 == 0) { | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| if (da_r == 0.0 && da_i == 0.0) | |||
| { | |||
| x[ip] = 0.0; | |||
| x[ip+1] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| } | |||
| inc_x2 = 2 * inc_x; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| if ( da_r == 0.0 ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = 0.0; | |||
| x[ip+1] = 0.0 ; | |||
| } | |||
| else | |||
| { | |||
| temp = - da_i * x[ip+1] ; | |||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_i * x[ip] ; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = da_r * x[ip] ; | |||
| x[ip+1] = da_r * x[ip+1]; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| } | |||
| } | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ifndef SROTMKERNEL | |||
| SROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef DROTMKERNEL | |||
| DROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| @@ -1,6 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n_sve.c | |||
| SGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| @@ -74,16 +74,21 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| @@ -0,0 +1,3 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = sgemv_n_neon.c | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| @@ -98,8 +102,18 @@ ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| endif | |||
| else | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| @@ -60,13 +60,13 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| @@ -198,3 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||
| @@ -1,4 +1,24 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SBGEMM_BETA = sbgemm_beta_neoversev1.c | |||
| SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c | |||
| ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||
| SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
| SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
| SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
| endif | |||
| @@ -1 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
| SBGEMVNKERNEL = sbgemv_n_neon.c | |||
| endif | |||
| @@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ifndef SROTMKERNEL | |||
| SROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef DROTMKERNEL | |||
| DROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| @@ -1,216 +1,217 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #define N "x0" /* vector length */ | |||
| #define X "x1" /* X vector address */ | |||
| #define INC_X "x2" /* X stride */ | |||
| #define Y "x3" /* Y vector address */ | |||
| #define INC_Y "x4" /* Y stride */ | |||
| #define J "x5" /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define TMPF "s0" | |||
| #define INC_SHIFT "2" | |||
| #define N_DIV_SHIFT "2" | |||
| #define N_REM_MASK "3" | |||
| #else | |||
| #define TMPF "d0" | |||
| #define INC_SHIFT "3" | |||
| #define N_DIV_SHIFT "1" | |||
| #define N_REM_MASK "1" | |||
| #endif | |||
| #else | |||
| #if !defined(DOUBLE) | |||
| #define TMPF "d0" | |||
| #define INC_SHIFT "3" | |||
| #define N_DIV_SHIFT "1" | |||
| #define N_REM_MASK "1" | |||
| #else | |||
| #define TMPF "q0" | |||
| #define INC_SHIFT "4" | |||
| #define N_DIV_SHIFT "0" | |||
| #define N_REM_MASK "0" | |||
| #endif | |||
| #endif | |||
| #define KERNEL_F1 \ | |||
| "ldr "TMPF", ["X"] \n" \ | |||
| "add "X", "X", "INC_X" \n" \ | |||
| "str "TMPF", ["Y"] \n" \ | |||
| "add "Y", "Y", "INC_Y" \n" | |||
| #define KERNEL_F \ | |||
| "ldr q0, ["X"], #16 \n" \ | |||
| "str q0, ["Y"], #16 \n" | |||
| #define INIT \ | |||
| "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
| "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
| static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) /* Copies n elements from x (stride inc_x) to y (stride inc_y) using hand-written AArch64 inline assembly; always returns 0. */ | |||
| { | |||
| if ( n < 0 ) return 0; /* negative n: nothing to do (the asm below also exits when N <= 0) */ | |||
| __asm__ __volatile__ ( | |||
| " mov "N", %[N_] \n" /* load the five C operands into the fixed registers named by the N/X/INC_X/Y/INC_Y macros */ | |||
| " mov "X", %[X_] \n" | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " mov "Y", %[Y_] \n" | |||
| " mov "INC_Y", %[INCY_] \n" | |||
| " cmp "N", xzr \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" /* both strides equal to 1 -> contiguous path; anything else branches to the strided path at label 4 */ | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| "// .Lcopy_kernel_F_BEGIN: \n" | |||
| " "INIT" \n" /* INIT shifts both increments left by INC_SHIFT (presumably element count -> byte offset; confirm) */ | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" /* J = N >> N_DIV_SHIFT: number of 16-byte KERNEL_F iterations */ | |||
| " cmp "J", xzr \n" | |||
| " beq 2f //copy_kernel_F1 \n" | |||
| " .align 5 \n" | |||
| "1: //copy_kernel_F: \n" | |||
| " "KERNEL_F" \n" /* one q0 load/store pair, post-incrementing X and Y by 16 bytes */ | |||
| " subs "J", "J", #1 \n" | |||
| " bne 1b //copy_kernel_F \n" | |||
| "2: //copy_kernel_F1: \n" | |||
| #if defined(COMPLEX) && defined(DOUBLE) | |||
| " b 8f //copy_kernel_L999 \n" /* N_DIV_SHIFT is 0 in this configuration, so the loop above already covered all N elements */ | |||
| #else | |||
| " ands "J", "N", #"N_REM_MASK" \n" /* J = N & N_REM_MASK: leftover elements, copied one at a time below */ | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| #endif | |||
| "3: //copy_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 3b //copy_kernel_F10 \n" | |||
| " b 8f //copy_kernel_L999 \n" | |||
| "4: //copy_kernel_S_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " asr "J", "N", #2 \n" /* strided path: J = N >> 2 groups of four single-element copies */ | |||
| " cmp "J", xzr \n" | |||
| " ble 6f //copy_kernel_S1 \n" | |||
| "5: //copy_kernel_S4: \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 5b //copy_kernel_S4 \n" | |||
| "6: //copy_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" /* remaining N & 3 elements */ | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| "7: //copy_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 7b //copy_kernel_S10 \n" | |||
| "8: //copy_kernel_L999: \n" | |||
| : /* no output operands; the inputs below are referenced by name in the template */ | |||
| : [N_] "r" (n), //%1 | |||
| [X_] "r" (x), //%2 | |||
| [INCX_] "r" (inc_x), //%3 | |||
| [Y_] "r" (y), //%4 | |||
| [INCY_] "r" (inc_y) //%5 | |||
| : "cc", /* clobbers: condition flags, memory, the six x registers used above, and d0 */ | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
| "d0" /* NOTE(review): the template also uses q0 (KERNEL_F) and TMPF; confirm the d0 clobber covers the whole vector register on all supported compilers */ | |||
| ); | |||
| return 0; | |||
| } | |||
| #if defined(SMP) | |||
| static int copy_thread_function(BLASLONG n, BLASLONG dummy0, /* Worker passed to blas_level1_thread by the SMP path of CNAME; only n, x, inc_x, y and inc_y are used. Always returns 0. */ | |||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||
| { | |||
| do_copy(n, x, inc_x, y, inc_y); /* forward to the single-threaded kernel; every dummy* argument is ignored */ | |||
| return 0; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) /* Copy entry point: copies n elements from x to y via do_copy, on the calling thread or through blas_level1_thread when SMP is defined; always returns 0. */ | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT dummy_alpha; /* never initialised; only its address is handed to blas_level1_thread */ | |||
| #endif | |||
| if (n <= 0) return 0; /* nothing to copy */ | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || n <= 10000) /* stay single-threaded for a zero source stride or for n <= 10000 */ | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| do_copy(n, x, inc_x, y, inc_y); | |||
| } else { | |||
| int mode = 0; /* mode = real/complex flag OR-ed with single/double flag, selected by the COMPLEX and DOUBLE macros */ | |||
| #if !defined(COMPLEX) | |||
| mode = BLAS_REAL; | |||
| #else | |||
| mode = BLAS_COMPLEX; | |||
| #endif | |||
| #if !defined(DOUBLE) | |||
| mode |= BLAS_SINGLE; | |||
| #else | |||
| mode |= BLAS_DOUBLE; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, &dummy_alpha, /* presumably splits the n elements across nthreads calls of copy_thread_function; TODO confirm against blas_level1_thread */ | |||
| x, inc_x, y, inc_y, NULL, 0, | |||
| ( void *)copy_thread_function, nthreads); | |||
| } | |||
| #else | |||
| do_copy(n, x, inc_x, y, inc_y); /* non-SMP build: always copy on the calling thread */ | |||
| #endif | |||
| return 0; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #define N "x0" /* vector length */ | |||
| #define X "x1" /* X vector address */ | |||
| #define INC_X "x2" /* X stride */ | |||
| #define Y "x3" /* Y vector address */ | |||
| #define INC_Y "x4" /* Y stride */ | |||
| #define J "x5" /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define TMPF "s0" | |||
| #define INC_SHIFT "2" | |||
| #define N_DIV_SHIFT "2" | |||
| #define N_REM_MASK "3" | |||
| #else | |||
| #define TMPF "d0" | |||
| #define INC_SHIFT "3" | |||
| #define N_DIV_SHIFT "1" | |||
| #define N_REM_MASK "1" | |||
| #endif | |||
| #else | |||
| #if !defined(DOUBLE) | |||
| #define TMPF "d0" | |||
| #define INC_SHIFT "3" | |||
| #define N_DIV_SHIFT "1" | |||
| #define N_REM_MASK "1" | |||
| #else | |||
| #define TMPF "q0" | |||
| #define INC_SHIFT "4" | |||
| #define N_DIV_SHIFT "0" | |||
| #define N_REM_MASK "0" | |||
| #endif | |||
| #endif | |||
| #define KERNEL_F1 \ | |||
| "ldr "TMPF", ["X"] \n" \ | |||
| "add "X", "X", "INC_X" \n" \ | |||
| "str "TMPF", ["Y"] \n" \ | |||
| "add "Y", "Y", "INC_Y" \n" | |||
| #define KERNEL_F \ | |||
| "ldr q0, ["X"], #16 \n" \ | |||
| "str q0, ["Y"], #16 \n" | |||
| #define INIT \ | |||
| "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
| "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
| static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if ( n < 0 ) return 0; | |||
| __asm__ __volatile__ ( | |||
| " mov "N", %[N_] \n" | |||
| " mov "X", %[X_] \n" | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " mov "Y", %[Y_] \n" | |||
| " mov "INC_Y", %[INCY_] \n" | |||
| " cmp "N", xzr \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| "// .Lcopy_kernel_F_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq 2f //copy_kernel_F1 \n" | |||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
| " .align 5 \n" | |||
| #endif | |||
| "1: //copy_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 1b //copy_kernel_F \n" | |||
| "2: //copy_kernel_F1: \n" | |||
| #if defined(COMPLEX) && defined(DOUBLE) | |||
| " b 8f //copy_kernel_L999 \n" | |||
| #else | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| #endif | |||
| "3: //copy_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 3b //copy_kernel_F10 \n" | |||
| " b 8f //copy_kernel_L999 \n" | |||
| "4: //copy_kernel_S_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble 6f //copy_kernel_S1 \n" | |||
| "5: //copy_kernel_S4: \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 5b //copy_kernel_S4 \n" | |||
| "6: //copy_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| "7: //copy_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne 7b //copy_kernel_S10 \n" | |||
| "8: //copy_kernel_L999: \n" | |||
| : | |||
| : [N_] "r" (n), //%1 | |||
| [X_] "r" (x), //%2 | |||
| [INCX_] "r" (inc_x), //%3 | |||
| [Y_] "r" (y), //%4 | |||
| [INCY_] "r" (inc_y) //%5 | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
| "d0" | |||
| ); | |||
| return 0; | |||
| } | |||
#if defined(SMP)
/*
 * Worker entry point handed to blas_level1_thread() by CNAME().
 * Only n, x, inc_x, y and inc_y are used; they are forwarded to do_copy().
 * The dummy parameters are ignored.  Always returns 0.
 */
static int copy_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4)
{
	(void)dummy0;
	(void)dummy1;
	(void)dummy2;
	(void)dummy3;
	(void)dummy4;

	(void)do_copy(n, x, inc_x, y, inc_y);

	return 0;
}
#endif
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT dummy_alpha; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| do_copy(n, x, inc_x, y, inc_y); | |||
| } else { | |||
| int mode = 0; | |||
| #if !defined(COMPLEX) | |||
| mode = BLAS_REAL; | |||
| #else | |||
| mode = BLAS_COMPLEX; | |||
| #endif | |||
| #if !defined(DOUBLE) | |||
| mode |= BLAS_SINGLE; | |||
| #else | |||
| mode |= BLAS_DOUBLE; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, NULL, 0, | |||
| ( void *)copy_thread_function, nthreads); | |||
| } | |||
| #else | |||
| do_copy(n, x, inc_x, y, inc_y); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " cmp "J", xzr \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
| ".align 5 \n" | |||
| #endif | |||
| "2: //asum_kernel_F32: \n" | |||
| " "KERNEL_F32" \n" | |||
| " subs "J", "J", #1 \n" | |||
| @@ -213,7 +213,7 @@ CNAME(BLASLONG M, | |||
| const BLASLONG n2 = N & -2; | |||
| const BLASLONG n8 = N & -8; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| @@ -219,7 +219,7 @@ CNAME(BLASLONG M, | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #ifdef DYNAMIC_ARCH | |||
| extern char* gotoblas_corename(void); | |||
| #endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-count lookup for the dot kernel, selected for the "neoversev1" core.
 *
 * N    - vector length
 * ncpu - upper bound on the number of threads that may be used
 *
 * Returns 1 for the smallest size class, MIN(ncpu, cap) for the intermediate
 * classes, and ncpu once N exceeds the last threshold.  The thresholds
 * differ between the DOUBLE and non-DOUBLE builds.
 */
static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) {
#ifdef DOUBLE
  if (N <= 64500L)   return 1;
  if (N <= 100000L)  return MIN(ncpu, 2);
  if (N <= 150000L)  return MIN(ncpu, 4);
  if (N <= 260000L)  return MIN(ncpu, 8);
  if (N <= 360000L)  return MIN(ncpu, 16);
  if (N <= 520000L)  return MIN(ncpu, 24);
  if (N <= 1010000L) return MIN(ncpu, 56);
#else
  if (N <= 110000L)  return 1;
  if (N <= 200000L)  return MIN(ncpu, 2);
  if (N <= 280000L)  return MIN(ncpu, 4);
  if (N <= 520000L)  return MIN(ncpu, 8);
  if (N <= 830000L)  return MIN(ncpu, 16);
  if (N <= 1010000L) return MIN(ncpu, 24);
#endif
  return ncpu;
}
#endif
| static inline int get_dot_optimal_nthreads(BLASLONG n) { | |||
| int ncpu = num_cpu_avail(1); | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
| } | |||
| #endif | |||
| // Default case | |||
| if (n <= 10000L) | |||
| return 1; | |||
| else | |||
| return num_cpu_avail(1); | |||
| } | |||
| #endif | |||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| nthreads = get_dot_optimal_nthreads(n); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| (void *)dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fadd v4.4s, v4.4s, v6.4s \n" \ | |||
| " fadd v0.4s, v0.4s, v4.4s \n" \ | |||
| " faddp v0.4s, v0.4s, v0.4s \n" \ | |||
| " faddp v0.4s, v0.4s, v0.4s \n" | |||
| " faddp "OUT", v0.2s \n" | |||
| #else /* !defined(DSDOT) */ | |||
| #define KERNEL_F1 \ | |||
| @@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT | |||
| " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | |||
| " cmp %[J_], xzr \n" | |||
| " beq 3f //dot_kernel_F1 \n" | |||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
| " .align 5 \n" | |||
| #endif | |||
| "2: //dot_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs %[J_], %[J_], #1 \n" | |||
| @@ -1,5 +1,5 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| Copyright (c) 2024-2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| BLASLONG width = n / 3; | |||
| uint64_t sve_size = SV_COUNT(); | |||
| for (j = 0; j < n; j++) { | |||
| SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); | |||
| i = 0; | |||
| svbool_t pg = SV_WHILE(i, m); | |||
| while (svptest_any(SV_TRUE(), pg)) { | |||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
| svbool_t pg_true = SV_TRUE(); | |||
| svbool_t pg = SV_WHILE(0, m % sve_size); | |||
| FLOAT *a0_ptr = a + lda * width * 0; | |||
| FLOAT *a1_ptr = a + lda * width * 1; | |||
| FLOAT *a2_ptr = a + lda * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||
| ix = j * inc_x; | |||
| SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||
| SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||
| SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||
| SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||
| SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||
| SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||
| SV_TYPE y_vec = svld1(pg_true, y + i); | |||
| y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); | |||
| y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); | |||
| y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); | |||
| svst1(pg_true, y + i, y_vec); | |||
| } | |||
| if (i < m) { | |||
| SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||
| SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||
| SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||
| SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||
| SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||
| SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||
| SV_TYPE y_vec = svld1(pg, y + i); | |||
| y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); | |||
| y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); | |||
| y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); | |||
| y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); | |||
| ix += inc_x; | |||
| svst1(pg, y + i, y_vec); | |||
| i += sve_size; | |||
| pg = SV_WHILE(i, m); | |||
| } | |||
| a0_ptr += lda; | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| } | |||
| a_ptr = a2_ptr; | |||
| for (j = width * 3; j < n; j++) { | |||
| ix = j * inc_x; | |||
| for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||
| SV_TYPE y_vec = svld1(pg_true, y + i); | |||
| SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||
| SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||
| y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); | |||
| svst1(pg_true, y + i, y_vec); | |||
| } | |||
| if (i < m) { | |||
| SV_TYPE y_vec = svld1(pg, y + i); | |||
| SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
| y_vec = svmla_m(pg, y_vec, a_vec, x_vec); | |||
| svst1(pg, y + i, y_vec); | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| return (0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| @@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| } | |||
| @@ -0,0 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * Map the generic SV_* names used by the kernel below onto the 32-bit or
 * 64-bit flavour of each SVE intrinsic/type, depending on DOUBLE.
 */
#ifndef DOUBLE
#define SV_COUNT svcntw
#define SV_TYPE  svfloat32_t
#define SV_TRUE  svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP   svdup_f32
#else
#define SV_COUNT svcntd
#define SV_TYPE  svfloat64_t
#define SV_TRUE  svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP   svdup_f64
#endif
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| BLASLONG width = (n + 3 - 1) / 3; | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| while ((i + sve_size * 1 - 1) < m) { | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| i += sve_size * 1; | |||
| } | |||
| if (i < m) { | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| } | |||
| a0_ptr += lda; | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,207 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * Map the generic SV_* names used by the kernel below onto the 32-bit or
 * 64-bit flavour of each SVE intrinsic/type, depending on DOUBLE.
 */
#ifndef DOUBLE
#define SV_COUNT svcntw
#define SV_TYPE  svfloat32_t
#define SV_TRUE  svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP   svdup_f32
#else
#define SV_COUNT svcntd
#define SV_TYPE  svfloat64_t
#define SV_TRUE  svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP   svdup_f64
#endif
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| BLASLONG width = (n + 3 - 1) / 3; | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| while ((i + sve_size * 4 - 1) < m) { | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
| SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); | |||
| i += sve_size * 4; | |||
| } | |||
| if (i < m) { | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
| svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||
| svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||
| svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
| pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||
| pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||
| pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||
| pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||
| pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||
| pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||
| pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| svst1_vnum(pg1, y + i, 1, y1_vec); | |||
| svst1_vnum(pg2, y + i, 2, y2_vec); | |||
| svst1_vnum(pg3, y + i, 3, y3_vec); | |||
| } | |||
| a0_ptr += lda; | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| Copyright (c) 2024, 2025 The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT temp; | |||
| iy = 0; | |||
| if (inc_x == 1) { | |||
| BLASLONG width = (n + 3 - 1) / 3; | |||
| BLASLONG width = n / 3; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| svbool_t pg_true = SV_TRUE(); | |||
| svbool_t pg = SV_WHILE(0, m % sve_size); | |||
| FLOAT *a0_ptr = a + lda * width * 0; | |||
| FLOAT *a1_ptr = a + lda * width * 1; | |||
| @@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| FLOAT *y2_ptr = y + inc_y * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp00_vec = SV_DUP(0.0); | |||
| SV_TYPE temp01_vec = SV_DUP(0.0); | |||
| SV_TYPE temp02_vec = SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| while ((i + sve_size * 1 - 1) < m) { | |||
| SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); | |||
| SV_TYPE x0_vec = svld1(pg_true, x + i); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||
| SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||
| SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||
| temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
| temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
| temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
| temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec); | |||
| temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec); | |||
| temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec); | |||
| i += sve_size * 1; | |||
| } | |||
| if (i < m) { | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| SV_TYPE x0_vec = svld1(pg, x + i); | |||
| SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); | |||
| SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||
| SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||
| SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
| temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
| temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
| temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec); | |||
| temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec); | |||
| temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec); | |||
| } | |||
| if ((j + width * 0) < n) { | |||
| temp = svaddv(SV_TRUE(), temp00_vec); | |||
| y0_ptr[iy] += alpha * temp; | |||
| } | |||
| if ((j + width * 1) < n) { | |||
| temp = svaddv(SV_TRUE(), temp01_vec); | |||
| y1_ptr[iy] += alpha * temp; | |||
| } | |||
| if ((j + width * 2) < n) { | |||
| temp = svaddv(SV_TRUE(), temp02_vec); | |||
| y2_ptr[iy] += alpha * temp; | |||
| } | |||
| y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec); | |||
| y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec); | |||
| y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec); | |||
| iy += inc_y; | |||
| a0_ptr += lda; | |||
| @@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| a2_ptr += lda; | |||
| } | |||
| a_ptr = a2_ptr; | |||
| y_ptr = y2_ptr; | |||
| for (j = width * 3; j < n; j++) { | |||
| SV_TYPE temp_vec = SV_DUP(0.0); | |||
| i = 0; | |||
| while ((i + sve_size * 1 - 1) < m) { | |||
| SV_TYPE x_vec = svld1(pg_true, x + i); | |||
| SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||
| temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec); | |||
| i += sve_size * 1; | |||
| } | |||
| if (i < m) { | |||
| SV_TYPE x_vec = svld1(pg, x + i); | |||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
| temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); | |||
| } | |||
| y_ptr[iy] += alpha * svaddv(pg_true, temp_vec); | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " asr "J", "N", #6 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| #if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
| ".align 5 \n" | |||
| #endif | |||
| "2: //asum_kernel_F64: \n" | |||
| " "KERNEL_F64" \n" | |||
| " subs "J", "J", #1 \n" | |||
| @@ -0,0 +1,83 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2024, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, | |||
| BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, | |||
| BLASLONG ldc) { | |||
| BLASLONG i, j; | |||
| BLASLONG chunk, remain; | |||
| FLOAT *c_offset1, *c_offset; | |||
| c_offset = c; | |||
| chunk = m >> 3; | |||
| remain = m & 7; | |||
| if (beta == ZERO) { | |||
| for (j = n; j > 0; j--) { | |||
| c_offset1 = c_offset; | |||
| c_offset += ldc; | |||
| for (i = chunk; i > 0; i--) { | |||
| *(c_offset1 + 0) = ZERO; | |||
| *(c_offset1 + 1) = ZERO; | |||
| *(c_offset1 + 2) = ZERO; | |||
| *(c_offset1 + 3) = ZERO; | |||
| *(c_offset1 + 4) = ZERO; | |||
| *(c_offset1 + 5) = ZERO; | |||
| *(c_offset1 + 6) = ZERO; | |||
| *(c_offset1 + 7) = ZERO; | |||
| c_offset1 += 8; | |||
| } | |||
| for (i = remain; i > 0; i--) { | |||
| *c_offset1 = ZERO; | |||
| c_offset1++; | |||
| } | |||
| } | |||
| } else { | |||
| for (j = n; j > 0; j--) { | |||
| c_offset1 = c_offset; | |||
| c_offset += ldc; | |||
| for (i = chunk; i > 0; i--) { | |||
| *(c_offset1 + 0) *= beta; | |||
| *(c_offset1 + 1) *= beta; | |||
| *(c_offset1 + 2) *= beta; | |||
| *(c_offset1 + 3) *= beta; | |||
| *(c_offset1 + 4) *= beta; | |||
| *(c_offset1 + 5) *= beta; | |||
| *(c_offset1 + 6) *= beta; | |||
| *(c_offset1 + 7) *= beta; | |||
| c_offset1 += 8; | |||
| } | |||
| for (i = remain; i > 0; i--) { | |||
| *c_offset1 *= beta; | |||
| c_offset1++; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| }; | |||
| @@ -0,0 +1,46 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * The implementation file is included twice. It picks its function name with
 * #ifdef ALPHA_ONE, so the first inclusion provides
 * sbgemm_kernel_neoversev1_alpha_one() and the second provides
 * sbgemm_kernel_neoversev1_alpha().
 */
#define ALPHA_ONE
#include "sbgemm_kernel_4x4_neoversev1_impl.c"
#undef ALPHA_ONE
#include "sbgemm_kernel_4x4_neoversev1_impl.c"

/*
 * Kernel entry point: forwards all arguments unchanged to the alpha == 1
 * specialization when alpha compares equal to 1.0f, and to the general-alpha
 * variant otherwise. Returns whatever the selected variant returns.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
          FLOAT *C, BLASLONG ldc) {
  if (alpha == 1.0f)
    return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc);

  return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc);
}
| @@ -0,0 +1,414 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * Helper macros for the kernel body below. They expand to statements that
 * use the kernel's local variables (mc<M><N>, ma<M>, mb<N>, svalpha) by name.
 */

/* Set the FP32 accumulator mc<M><N> to all zeros. */
#define INIT_C(M, N) mc##M##N = svdup_f32(0);

/* mc<M><N> = svbfmmla(mc<M><N>, ma<M>, mb<N>): BF16 matrix multiply-accumulate. */
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);

/*
 * Zero the four accumulators of a 4x4 tile.
 * NOTE: the expansion already ends in ';' (kept as-is for compatibility with
 * existing uses), so it must not be used as the sole body of an if/else.
 */
#define INIT_C_4x4 \
  do {             \
    INIT_C(0, 0);  \
    INIT_C(0, 1);  \
    INIT_C(1, 0);  \
    INIT_C(1, 1);  \
  } while (0);

/*
 * UPDATE_C(PG, PTR, DST, SRC): load the lanes selected by PG from PTR into
 * DST, combine them with SRC, and store the result back to PTR under PG.
 *   ALPHA_ONE defined:   DST = SRC + loaded value
 *   otherwise:           DST = svalpha * SRC + loaded value
 *
 * This file is meant to be included more than once with ALPHA_ONE toggled,
 * which gives UPDATE_C a different body each time. Drop any earlier
 * definition first so the second inclusion is not an incompatible macro
 * redefinition.
 *
 * NOTE: the expansion ends in ';' on purpose; existing call sites may omit
 * their own terminating semicolon.
 */
#undef UPDATE_C
#ifdef ALPHA_ONE
#define UPDATE_C(PG, PTR, DST, SRC)  \
  do {                               \
    DST = svld1_f32((PG), (PTR));    \
    DST = svadd_z((PG), SRC, DST);   \
    svst1_f32((PG), (PTR), DST);     \
  } while (0);
#else
#define UPDATE_C(PG, PTR, DST, SRC)         \
  do {                                      \
    DST = svld1_f32((PG), (PTR));           \
    DST = svmad_z((PG), svalpha, SRC, DST); \
    svst1_f32((PG), (PTR), DST);            \
  } while (0);
#endif

/*
 * Despite the name this uses svuzp1: tmp receives the even-indexed lanes of
 * mc0 followed by the even-indexed lanes of mc1; svcompact then packs the
 * lanes of tmp that are active in PG towards lane 0 of vc.
 */
#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \
  do {                                           \
    (tmp) = svuzp1_f32((mc0), (mc1));            \
    (vc) = svcompact_f32((PG), (tmp));           \
  } while (0)

/* Same as ZIP_EVEN_ELEMENTS, but svuzp2 selects the odd-indexed lanes. */
#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \
  do {                                          \
    (tmp) = svuzp2_f32((mc0), (mc1));           \
    (vc) = svcompact_f32((PG), (tmp));          \
  } while (0)

/*
 * Add mc<M><N>, rotated down by four 32-bit lanes (svext of the vector with
 * itself at offset 4), onto mc<M><N>. Lanes 0-3 then hold lane[i] + lane[i+4].
 * NOTE(review): the "last 4 / first 4" naming presumes 8 FP32 lanes per
 * vector (256-bit SVE) -- confirm before using on other vector lengths.
 */
#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP)               \
  do {                                                      \
    TMP = svext_f32(mc##M##N, mc##M##N, 4);                 \
    mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \
  } while (0)
/*
 * SBGEMM micro-kernel built on the SVE BF16 matrix-multiply intrinsic.
 *
 * Accumulates products of packed bf16 panels A and B in FP32 and adds them
 * to C: C += A*B when compiled with ALPHA_ONE, C += alpha * A*B otherwise
 * (see UPDATE_C). The function name is chosen by ALPHA_ONE so that both
 * variants can live in one translation unit.
 *
 * m, n   - extent of the C block; handled in tiles of 4, then 2, then 1 along
 *          each dimension.
 * k      - inner dimension; the loops run over pad_k = k rounded up to a
 *          multiple of 8.
 * alpha  - scale factor; not read by the ALPHA_ONE variant.
 * A, B   - packed bf16 panels. NOTE(review): the pointer strides used here
 *          (16 bf16 per pair of rows per 8 k-steps, panels pad_k long)
 *          presumably match the companion ncopy/tcopy packing routines --
 *          confirm against them.
 * C, ldc - FP32 output; the four tile pointers are ldc elements apart.
 *
 * NOTE(review): every svld1_bf16 uses an all-true predicate while pointers
 * advance by 16 bf16, and ACCUMULATE_LAST4_TO_FIRST4 folds lanes 4-7 onto
 * lanes 0-3, so the code appears to assume 256-bit SVE vectors.
 *
 * Always returns 0.
 */
#ifdef ALPHA_ONE
int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k,
                                       FLOAT alpha, IFLOAT *A, IFLOAT *B,
                                       FLOAT *C, BLASLONG ldc)
#else
int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k,
                                   FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C,
                                   BLASLONG ldc)
#endif
{
  BLASLONG pad_k = (k + 7) & ~7; /* k rounded up to a multiple of 8 */

  svbfloat16_t ma0, ma1, mb0, mb1;
  svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3;
  svfloat32_t tmp;
#ifdef ALPHA_ONE
  (void)alpha; /* this variant's UPDATE_C adds without scaling */
#else
  svfloat32_t svalpha = svdup_f32(alpha); /* referenced inside UPDATE_C */
#endif

  svbool_t pg16_all = svptrue_b16();
  svbool_t pg32_first_1 = svwhilelt_b32(0, 1);
  svbool_t pg32_first_2 = svwhilelt_b32(0, 2);
  svbool_t pg32_first_4 = svwhilelt_b32(0, 4);
  /* Lanes 0 and 1 of every 128-bit quadword; used with svcompact. */
  svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0);

  bfloat16_t *ptr_a = (bfloat16_t *)A;
  bfloat16_t *ptr_b = (bfloat16_t *)B;
  FLOAT *ptr_c = C;

  bfloat16_t *ptr_a0;
  bfloat16_t *ptr_b0;
  FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3;

  /* ---- groups of 4 along n ---- */
  for (BLASLONG j = 0; j < n / 4; j++) {
    ptr_c0 = ptr_c;
    ptr_c1 = ptr_c0 + ldc;
    ptr_c2 = ptr_c1 + ldc;
    ptr_c3 = ptr_c2 + ldc;
    ptr_c += 4 * ldc;
    ptr_a = (bfloat16_t *)A; /* restart at the beginning of A for each group */

    /* 4 (m) x 4 (n) tiles */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C_4x4;
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);
        MATMUL(1, 0);
        MATMUL(1, 1);

        ptr_a0 += 32;
        ptr_b0 += 32;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp);

      /* Gather four values per output pointer from the 2x2 sub-blocks. */
      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1);
      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2);
      UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3);

      ptr_c0 += 4;
      ptr_c1 += 4;
      ptr_c2 += 4;
      ptr_c3 += 4;
    }

    /* 2 (m) x 4 (n) tile */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(0, 1);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);

        ptr_a0 += 16;
        ptr_b0 += 32;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);

      vc0 = svuzp1(mc00, mc00);
      vc1 = svuzp2(mc00, mc00);
      vc2 = svuzp1(mc01, mc01);
      vc3 = svuzp2(mc01, mc01);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2);
      UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3);

      ptr_c0 += 2;
      ptr_c1 += 2;
      ptr_c2 += 2;
      ptr_c3 += 2;
    }

    /* 1 (m) x 4 (n) tile */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(0, 1);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);

        ptr_a0 += 16;
        ptr_b0 += 32;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);

      /* NOTE: svcompact would be a more straightforward way to do this. */
      vc1 = svuzp2(mc00, mc00);
      vc3 = svuzp2(mc01, mc01);

      /* Only lane 0 is stored: mc00/mc01 directly, lane 1 via svuzp2. */
      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
      UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01);
      UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3);
    }

    ptr_b += 4 * pad_k;
  }

  /* ---- group of 2 along n ---- */
  if (n & 2) {
    ptr_c0 = ptr_c;
    ptr_c1 = ptr_c0 + ldc;
    ptr_c += 2 * ldc;
    ptr_a = (bfloat16_t *)A;

    /* 4 (m) x 2 (n) tiles */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(1, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 32;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);

      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2);

      ptr_c0 += 4;
      ptr_c1 += 4;
    }

    /* 2 (m) x 2 (n) tile */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        MATMUL(0, 0);
        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      vc0 = svuzp1(mc00, mc00);
      vc1 = svuzp2(mc00, mc00);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1);

      ptr_c0 += 2;
      ptr_c1 += 2;
    }

    /* 1 (m) x 2 (n) tile */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        MATMUL(0, 0);
        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      vc1 = svuzp2(mc00, mc00);

      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
      UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1);
    }

    ptr_b += 2 * pad_k;
  }

  /* ---- single remaining index along n ---- */
  /* TODO: this case looks like pure overhead; find out whether it actually
   * occurs in our use cases. */
  if (n & 1) {
    ptr_c0 = ptr_c;
    ptr_a = (bfloat16_t *)A;

    /* 4 (m) x 1 (n) tiles */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(1, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 32;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);

      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);

      ptr_c0 += 4;
    }

    /* 2 (m) x 1 (n) tile */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      vc0 = svuzp1(mc00, mc00);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);

      ptr_c0 += 2;
    }

    /* 1 (m) x 1 (n) tile */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);

      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
    }
  }

  return 0;
}
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
/*
 * Pack n source vectors of m bf16 elements each (vector j starts at
 * a + j * lda) into the contiguous buffer b.
 *
 * Vectors are consumed in groups of 4, then 2, then 1. Within a group the
 * elements are processed 8 at a time: each pair of vectors is combined with
 * svzip1_u64, which interleaves their 64-bit chunks (4 bf16 each), and the
 * resulting 16 bf16 are written with one full-vector store. A final partial
 * run of m & 7 elements is loaded under a shorter predicate, so the lanes
 * beyond it are zero in the packed output. A single leftover vector is paired
 * with an all-zero vector.
 *
 * The stores use an all-true 16-bit predicate while b advances by 16 elements
 * per store, i.e. the routine is written for 256-bit SVE vectors.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  const BLASLONG full_runs = m / 8; /* complete 8-element runs per vector */
  const BLASLONG tail = m & 7;      /* elements in the final partial run */

  IFLOAT *src = a;
  IFLOAT *dst = b;
  IFLOAT *row[4];

  /* Build a bf16 zero through its bit pattern. */
  bfloat16_t bf16_zero;
  *((uint16_t *)(&bf16_zero)) = 0;

  const svbool_t store_all = svptrue_b16(); /* 16 lanes on a 256-bit machine */
  const svbool_t load_8 = svwhilelt_b16(0, 8);
  const svbool_t load_tail = svwhilelt_b16_s32(0, tail);

  /* Partner operand for a vector that has no second vector to pair with. */
  const svuint64_t zero_u64 = svreinterpret_u64_bf16(svdup_bf16(bf16_zero));

  svbfloat16_t r0, r1, r2, r3;
  svuint64_t lo, hi;

  /* ---- groups of four vectors ---- */
  for (BLASLONG group = 0; group < n / 4; group++) {
    row[0] = src;
    row[1] = src + lda;
    row[2] = src + 2 * lda;
    row[3] = src + 3 * lda;
    src += 4 * lda;

    for (BLASLONG run = full_runs; run > 0; run--) {
      r0 = svld1_bf16(load_8, (bfloat16_t *)row[0]);
      r1 = svld1_bf16(load_8, (bfloat16_t *)row[1]);
      r2 = svld1_bf16(load_8, (bfloat16_t *)row[2]);
      r3 = svld1_bf16(load_8, (bfloat16_t *)row[3]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), svreinterpret_u64_bf16(r1));
      hi = svzip1_u64(svreinterpret_u64_bf16(r2), svreinterpret_u64_bf16(r3));

      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));
      svst1_bf16(store_all, (bfloat16_t *)dst + 16,
                 svreinterpret_bf16_u64(hi));

      row[0] += 8;
      row[1] += 8;
      row[2] += 8;
      row[3] += 8;
      dst += 32;
    }

    if (tail != 0) {
      /* Partial run: lanes past `tail` are not loaded and end up as zeros. */
      r0 = svld1_bf16(load_tail, (bfloat16_t *)row[0]);
      r1 = svld1_bf16(load_tail, (bfloat16_t *)row[1]);
      r2 = svld1_bf16(load_tail, (bfloat16_t *)row[2]);
      r3 = svld1_bf16(load_tail, (bfloat16_t *)row[3]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), svreinterpret_u64_bf16(r1));
      hi = svzip1_u64(svreinterpret_u64_bf16(r2), svreinterpret_u64_bf16(r3));

      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));
      svst1_bf16(store_all, (bfloat16_t *)dst + 16,
                 svreinterpret_bf16_u64(hi));

      dst += 32;
    }
  }

  /* ---- one group of two vectors ---- */
  if (n & 2) {
    row[0] = src;
    row[1] = src + lda;
    src += 2 * lda;

    for (BLASLONG run = full_runs; run > 0; run--) {
      r0 = svld1_bf16(load_8, (bfloat16_t *)row[0]);
      r1 = svld1_bf16(load_8, (bfloat16_t *)row[1]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), svreinterpret_u64_bf16(r1));
      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));

      dst += 16;
      row[0] += 8;
      row[1] += 8;
    }

    if (tail != 0) {
      r0 = svld1_bf16(load_tail, (bfloat16_t *)row[0]);
      r1 = svld1_bf16(load_tail, (bfloat16_t *)row[1]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), svreinterpret_u64_bf16(r1));
      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));

      dst += 16;
    }
  }

  /* ---- single leftover vector, paired with zeros ---- */
  if (n & 1) {
    row[0] = src;

    for (BLASLONG run = full_runs; run > 0; run--) {
      r0 = svld1_bf16(load_8, (bfloat16_t *)row[0]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), zero_u64);
      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));

      dst += 16;
      row[0] += 8;
    }

    if (tail != 0) {
      r0 = svld1_bf16(load_tail, (bfloat16_t *)row[0]);

      lo = svzip1_u64(svreinterpret_u64_bf16(r0), zero_u64);
      svst1_bf16(store_all, (bfloat16_t *)dst, svreinterpret_bf16_u64(lo));
    }
  }

  return 0;
}
| @@ -0,0 +1,361 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2024-2025, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| BLASLONG pad_m = ((m + 7) & ~7); | |||
| BLASLONG rest = (m & 7); // rest along m dim | |||
| IFLOAT *a_offset; | |||
| IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; | |||
| IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; | |||
| IFLOAT *b_offset; | |||
| IFLOAT *b_offset0, *b_offset1; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; | |||
| svuint16_t t0, t1, t2, t3; | |||
| svuint32_t m00, m01, m10, m11; | |||
| svuint64_t st_offsets_0, st_offsets_1; | |||
| svbool_t pg16_first_4 = svwhilelt_b16(0, 4); | |||
| svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||
| svbool_t pg64_first_4 = svwhilelt_b64(0, 4); | |||
| u_int32_t sizeof_u64 = 8; | |||
| u_int64_t _st_offsets_0[4] = { | |||
| 0 * sizeof_u64, | |||
| 1 * sizeof_u64, | |||
| 4 * sizeof_u64, | |||
| 5 * sizeof_u64, | |||
| }; | |||
| u_int64_t _st_offsets_1[4] = { | |||
| 2 * sizeof_u64, | |||
| 3 * sizeof_u64, | |||
| 6 * sizeof_u64, | |||
| 7 * sizeof_u64, | |||
| }; | |||
| st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); | |||
| st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); | |||
| for (BLASLONG j = 0; j < n / 8; j++) { | |||
| a_offset0 = a_offset; | |||
| a_offset1 = a_offset0 + lda; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset += 8; | |||
| b_offset0 = b_offset; | |||
| b_offset1 = b_offset0 + 4 * pad_m; | |||
| b_offset += 8 * pad_m; | |||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||
| // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||
| // small blocks | |||
| c0 = svld1_u16(pg16_first_8, a_offset0); | |||
| c1 = svld1_u16(pg16_first_8, a_offset1); | |||
| c2 = svld1_u16(pg16_first_8, a_offset2); | |||
| c3 = svld1_u16(pg16_first_8, a_offset3); | |||
| c4 = svld1_u16(pg16_first_8, a_offset4); | |||
| c5 = svld1_u16(pg16_first_8, a_offset5); | |||
| c6 = svld1_u16(pg16_first_8, a_offset6); | |||
| c7 = svld1_u16(pg16_first_8, a_offset7); | |||
| t0 = svzip1_u16(c0, c1); | |||
| t1 = svzip1_u16(c2, c3); | |||
| t2 = svzip1_u16(c4, c5); | |||
| t3 = svzip1_u16(c6, c7); | |||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
| st_offsets_0, svreinterpret_u64_u32(m10)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
| st_offsets_1, svreinterpret_u64_u32(m11)); | |||
| a_offset0 += 8 * lda; | |||
| a_offset1 += 8 * lda; | |||
| a_offset2 += 8 * lda; | |||
| a_offset3 += 8 * lda; | |||
| a_offset4 += 8 * lda; | |||
| a_offset5 += 8 * lda; | |||
| a_offset6 += 8 * lda; | |||
| a_offset7 += 8 * lda; | |||
| b_offset0 += 32; | |||
| b_offset1 += 32; | |||
| } | |||
| if (rest) { | |||
| c0 = svld1_u16(pg16_first_8, a_offset0); | |||
| c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); | |||
| c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); | |||
| c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); | |||
| c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); | |||
| c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); | |||
| c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); | |||
| c7 = (svdup_u16(0)); | |||
| t0 = svzip1_u16(c0, c1); | |||
| t1 = svzip1_u16(c2, c3); | |||
| t2 = svzip1_u16(c4, c5); | |||
| t3 = svzip1_u16(c6, c7); | |||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
| st_offsets_0, svreinterpret_u64_u32(m10)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
| st_offsets_1, svreinterpret_u64_u32(m11)); | |||
| } | |||
| } | |||
| if (n & 4) { | |||
| a_offset0 = a_offset; | |||
| a_offset1 = a_offset0 + lda; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset += 4; | |||
| b_offset0 = b_offset; | |||
| b_offset += 4 * pad_m; | |||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||
| // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 | |||
| // small blocks | |||
| c0 = svld1_u16(pg16_first_4, a_offset0); | |||
| c1 = svld1_u16(pg16_first_4, a_offset1); | |||
| c2 = svld1_u16(pg16_first_4, a_offset2); | |||
| c3 = svld1_u16(pg16_first_4, a_offset3); | |||
| c4 = svld1_u16(pg16_first_4, a_offset4); | |||
| c5 = svld1_u16(pg16_first_4, a_offset5); | |||
| c6 = svld1_u16(pg16_first_4, a_offset6); | |||
| c7 = svld1_u16(pg16_first_4, a_offset7); | |||
| t0 = svzip1_u16(c0, c1); | |||
| t1 = svzip1_u16(c2, c3); | |||
| t2 = svzip1_u16(c4, c5); | |||
| t3 = svzip1_u16(c6, c7); | |||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||
| a_offset0 += 8 * lda; | |||
| a_offset1 += 8 * lda; | |||
| a_offset2 += 8 * lda; | |||
| a_offset3 += 8 * lda; | |||
| a_offset4 += 8 * lda; | |||
| a_offset5 += 8 * lda; | |||
| a_offset6 += 8 * lda; | |||
| a_offset7 += 8 * lda; | |||
| b_offset0 += 32; | |||
| } | |||
| if (rest) { | |||
| c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 | |||
| c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); | |||
| c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); | |||
| c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); | |||
| c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); | |||
| c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); | |||
| c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); | |||
| c7 = (svdup_u16(0)); | |||
| t0 = svzip1_u16(c0, c1); | |||
| t1 = svzip1_u16(c2, c3); | |||
| t2 = svzip1_u16(c4, c5); | |||
| t3 = svzip1_u16(c6, c7); | |||
| m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
| m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_0, svreinterpret_u64_u32(m00)); | |||
| svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
| st_offsets_1, svreinterpret_u64_u32(m01)); | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offset0 = a_offset; | |||
| a_offset1 = a_offset0 + lda; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset += 2; | |||
| b_offset0 = b_offset; | |||
| b_offset1 = b_offset0 + 8; | |||
| b_offset += 2 * pad_m; | |||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||
| for (BLASLONG line = 0; line < 2; line++) { | |||
| b_offset0[line * 4] = a_offset0[line]; | |||
| b_offset0[line * 4 + 1] = a_offset1[line]; | |||
| b_offset0[line * 4 + 2] = a_offset2[line]; | |||
| b_offset0[line * 4 + 3] = a_offset3[line]; | |||
| b_offset1[line * 4] = a_offset4[line]; | |||
| b_offset1[line * 4 + 1] = a_offset5[line]; | |||
| b_offset1[line * 4 + 2] = a_offset6[line]; | |||
| b_offset1[line * 4 + 3] = a_offset7[line]; | |||
| } | |||
| b_offset0 += 16; | |||
| b_offset1 += 16; | |||
| a_offset0 += 8 * lda; | |||
| a_offset1 += 8 * lda; | |||
| a_offset2 += 8 * lda; | |||
| a_offset3 += 8 * lda; | |||
| a_offset4 += 8 * lda; | |||
| a_offset5 += 8 * lda; | |||
| a_offset6 += 8 * lda; | |||
| a_offset7 += 8 * lda; | |||
| } | |||
| if (rest) { | |||
| for (BLASLONG line = 0; line < 2; line++) { | |||
| b_offset0[line * 4] = a_offset0[line]; | |||
| b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; | |||
| b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; | |||
| b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; | |||
| b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; | |||
| b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; | |||
| b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line]; | |||
| b_offset1[line * 4 + 3] = 0; | |||
| } | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offset0 = a_offset; | |||
| a_offset1 = a_offset0 + lda; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| for (BLASLONG i = 0; i < m / 8; i++) { | |||
| b_offset[0] = a_offset0[0]; | |||
| b_offset[1] = a_offset1[0]; | |||
| b_offset[2] = a_offset2[0]; | |||
| b_offset[3] = a_offset3[0]; | |||
| b_offset[4] = 0; | |||
| b_offset[5] = 0; | |||
| b_offset[6] = 0; | |||
| b_offset[7] = 0; | |||
| b_offset[8] = a_offset4[0]; | |||
| b_offset[9] = a_offset5[0]; | |||
| b_offset[10] = a_offset6[0]; | |||
| b_offset[11] = a_offset7[0]; | |||
| b_offset[12] = 0; | |||
| b_offset[13] = 0; | |||
| b_offset[14] = 0; | |||
| b_offset[15] = 0; | |||
| b_offset += 16; | |||
| a_offset0 += 8 * lda; | |||
| a_offset1 += 8 * lda; | |||
| a_offset2 += 8 * lda; | |||
| a_offset3 += 8 * lda; | |||
| a_offset4 += 8 * lda; | |||
| a_offset5 += 8 * lda; | |||
| a_offset6 += 8 * lda; | |||
| a_offset7 += 8 * lda; | |||
| } | |||
| if (rest) { | |||
| b_offset[0] = *a_offset0; | |||
| b_offset[1] = rest == 1 ? 0 : *a_offset1; | |||
| b_offset[2] = rest <= 2 ? 0 : *a_offset2; | |||
| b_offset[3] = rest <= 3 ? 0 : *a_offset3; | |||
| b_offset[4] = 0; | |||
| b_offset[5] = 0; | |||
| b_offset[6] = 0; | |||
| b_offset[7] = 0; | |||
| b_offset[8] = rest <= 4 ? 0 : *a_offset4; | |||
| b_offset[9] = rest <= 5 ? 0 : *a_offset5; | |||
| b_offset[10] = rest <= 6 ? 0 : *a_offset6; | |||
| b_offset[11] = 0; | |||
| b_offset[12] = 0; | |||
| b_offset[13] = 0; | |||
| b_offset[14] = 0; | |||
| b_offset[15] = 0; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,515 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| /* | |||
| * Scales the first n floats of x in place by beta. | |||
| * A beta of exactly zero zero-fills the range with memset instead of | |||
| * multiplying, so the previous contents of x are discarded. | |||
| */ | |||
| static void beta_op(float *x, BLASLONG n, FLOAT beta) { | |||
| if (beta == 0) { | |||
| memset(x, 0, n * sizeof(float)); | |||
| return; | |||
| } | |||
| BLASLONG blocks = n / 16; | |||
| BLASLONG leftover = n & 15; | |||
| BLASLONG quads = leftover / 4; | |||
| BLASLONG singles = leftover & 3; | |||
| /* Main loop: 16 floats (four 128-bit vectors) per pass. */ | |||
| while (blocks-- > 0) { | |||
| float32x4_t v0 = vmulq_n_f32(vld1q_f32(x), beta); | |||
| float32x4_t v1 = vmulq_n_f32(vld1q_f32(x + 4), beta); | |||
| float32x4_t v2 = vmulq_n_f32(vld1q_f32(x + 8), beta); | |||
| float32x4_t v3 = vmulq_n_f32(vld1q_f32(x + 12), beta); | |||
| vst1q_f32(x, v0); | |||
| vst1q_f32(x + 4, v1); | |||
| vst1q_f32(x + 8, v2); | |||
| vst1q_f32(x + 12, v3); | |||
| x += 16; | |||
| } | |||
| /* Up to three further 4-float vectors. */ | |||
| while (quads-- > 0) { | |||
| vst1q_f32(x, vmulq_n_f32(vld1q_f32(x), beta)); | |||
| x += 4; | |||
| } | |||
| /* Last one to three elements, handled as scalars. */ | |||
| for (BLASLONG k = 0; k < singles; k++) { | |||
| x[k] *= beta; | |||
| } | |||
| } | |||
| /* | |||
| * Computes y := alpha * A * x + beta * y. | |||
| * A is a bf16 matrix with m rows and n columns; column c starts at | |||
| * a + c * lda and its m entries are contiguous. x supplies n bf16 values | |||
| * (stride incx) and y holds m fp32 values (stride incy). | |||
| * | |||
| * When incx == 1 and incy == 1 the NEON path is used: y is first scaled by | |||
| * beta, then the columns are consumed in groups of 8, 4, 2 and 1, each group | |||
| * adding its contribution to y. Any other stride combination falls through | |||
| * to the scalar loop at the bottom. Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, | |||
| bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { | |||
| BLASLONG i, j; | |||
| bfloat16_t *a_ptr, *x_ptr; | |||
| FLOAT *y_ptr; | |||
| bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; | |||
| bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; | |||
| bfloat16x8_t x_vec; | |||
| bfloat16x4_t x_vecx4; | |||
| float32x4_t y1_vec, y2_vec; | |||
| float32x4_t fp32_low, fp32_high; | |||
| float x0, x1, x2, x3, x4, x5, x6, x7; | |||
| bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, | |||
| *a_ptr7; | |||
| a_ptr = (bfloat16_t *)a; | |||
| x_ptr = (bfloat16_t *)x; | |||
| /* Rows left over after the 8-row and 4-row vector steps. */ | |||
| BLASLONG rest_m = m & 3; | |||
| bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0)); | |||
| bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0)); | |||
| if (incx == 1 && incy == 1) { | |||
| /* Apply beta once up front; every column group below only accumulates. */ | |||
| if (beta != 1) { | |||
| beta_op(y, m, beta); | |||
| } | |||
| /* ---- groups of 8 columns ---- */ | |||
| for (i = 0; i < n / 8; i++) { | |||
| a_ptr0 = a_ptr; | |||
| a_ptr1 = a_ptr0 + lda; | |||
| a_ptr2 = a_ptr1 + lda; | |||
| a_ptr3 = a_ptr2 + lda; | |||
| a_ptr4 = a_ptr3 + lda; | |||
| a_ptr5 = a_ptr4 + lda; | |||
| a_ptr6 = a_ptr5 + lda; | |||
| a_ptr7 = a_ptr6 + lda; | |||
| a_ptr += 8 * lda; | |||
| y_ptr = y; | |||
| x_vec = vld1q_bf16(x_ptr); | |||
| if (alpha != 1) { | |||
| /* Widen bf16 to fp32 by interleaving a zero halfword below each value, | |||
| scale by alpha, then narrow the products back to bf16. */ | |||
| fp32_low = vreinterpretq_f32_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||
| vreinterpretq_u16_bf16(x_vec))); | |||
| fp32_high = vreinterpretq_f32_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||
| vreinterpretq_u16_bf16(x_vec))); | |||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
| fp32_high = vmulq_n_f32(fp32_high, alpha); | |||
| x_vec = | |||
| vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high)); | |||
| } | |||
| /* 8 rows x 8 columns per pass. */ | |||
| for (j = 0; j < m / 8; j++) { | |||
| a0 = vld1q_bf16(a_ptr0); | |||
| a1 = vld1q_bf16(a_ptr1); | |||
| a2 = vld1q_bf16(a_ptr2); | |||
| a3 = vld1q_bf16(a_ptr3); | |||
| a4 = vld1q_bf16(a_ptr4); | |||
| a5 = vld1q_bf16(a_ptr5); | |||
| a6 = vld1q_bf16(a_ptr6); | |||
| a7 = vld1q_bf16(a_ptr7); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| y2_vec = vld1q_f32(y_ptr + 4); | |||
| /* Interleave column pairs: t0..t3 carry rows 0-3, t4..t7 rows 4-7, with | |||
| the even column of each pair in even lanes and the odd column in odd | |||
| lanes. */ | |||
| t0 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t1 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
| t2 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
| t3 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
| t4 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t5 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
| t6 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
| t7 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
| /* The "b" form multiplies the even lanes (even column) and the "t" form | |||
| the odd lanes (odd column), each by the matching lane of x. */ | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0); | |||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1); | |||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2); | |||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3); | |||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4); | |||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5); | |||
| y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6); | |||
| y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| vst1q_f32(y_ptr + 4, y2_vec); | |||
| a_ptr0 += 8; | |||
| a_ptr1 += 8; | |||
| a_ptr2 += 8; | |||
| a_ptr3 += 8; | |||
| a_ptr4 += 8; | |||
| a_ptr5 += 8; | |||
| a_ptr6 += 8; | |||
| a_ptr7 += 8; | |||
| y_ptr += 8; | |||
| } | |||
| /* 4 rows x 8 columns: load half vectors and pad the upper half with | |||
| zeros so the same interleave can be reused. */ | |||
| if (m & 4) { | |||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
| bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||
| bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||
| bfloat16x4_t a4x4 = vld1_bf16(a_ptr4); | |||
| bfloat16x4_t a5x4 = vld1_bf16(a_ptr5); | |||
| bfloat16x4_t a6x4 = vld1_bf16(a_ptr6); | |||
| bfloat16x4_t a7x4 = vld1_bf16(a_ptr7); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| a0 = vcombine_bf16(a0x4, bf16_zero); | |||
| a1 = vcombine_bf16(a1x4, bf16_zero); | |||
| a2 = vcombine_bf16(a2x4, bf16_zero); | |||
| a3 = vcombine_bf16(a3x4, bf16_zero); | |||
| a4 = vcombine_bf16(a4x4, bf16_zero); | |||
| a5 = vcombine_bf16(a5x4, bf16_zero); | |||
| a6 = vcombine_bf16(a6x4, bf16_zero); | |||
| a7 = vcombine_bf16(a7x4, bf16_zero); | |||
| t0 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t1 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
| t2 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
| t3 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||
| y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||
| y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| a_ptr0 += 4; | |||
| a_ptr1 += 4; | |||
| a_ptr2 += 4; | |||
| a_ptr3 += 4; | |||
| a_ptr4 += 4; | |||
| a_ptr5 += 4; | |||
| a_ptr6 += 4; | |||
| a_ptr7 += 4; | |||
| y_ptr += 4; | |||
| } | |||
| /* Last 1-3 rows in scalar code. Note that alpha * x stays in fp32 here, | |||
| whereas the vector steps above use the bf16-rounded product. */ | |||
| if (rest_m) { | |||
| x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); | |||
| x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); | |||
| x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); | |||
| x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); | |||
| x4 = alpha * vcvtah_f32_bf16(x_ptr[4]); | |||
| x5 = alpha * vcvtah_f32_bf16(x_ptr[5]); | |||
| x6 = alpha * vcvtah_f32_bf16(x_ptr[6]); | |||
| x7 = alpha * vcvtah_f32_bf16(x_ptr[7]); | |||
| for (j = 0; j < rest_m; j++) { | |||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
| y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||
| y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||
| y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||
| y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]); | |||
| y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]); | |||
| y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]); | |||
| y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]); | |||
| } | |||
| } | |||
| x_ptr += 8; | |||
| } | |||
| /* ---- one group of 4 columns ---- */ | |||
| if (n & 4) { | |||
| a_ptr0 = a_ptr; | |||
| a_ptr1 = a_ptr0 + lda; | |||
| a_ptr2 = a_ptr1 + lda; | |||
| a_ptr3 = a_ptr2 + lda; | |||
| a_ptr += 4 * lda; | |||
| x_vecx4 = vld1_bf16(x_ptr); | |||
| if (alpha != 1) { | |||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
| x_vecx4 = vcvt_bf16_f32(fp32_low); | |||
| } | |||
| y_ptr = y; | |||
| for (j = 0; j < m / 8; j++) { | |||
| a0 = vld1q_bf16(a_ptr0); | |||
| a1 = vld1q_bf16(a_ptr1); | |||
| a2 = vld1q_bf16(a_ptr2); | |||
| a3 = vld1q_bf16(a_ptr3); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| y2_vec = vld1q_f32(y_ptr + 4); | |||
| t0 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t1 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
| t4 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t5 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); | |||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); | |||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2); | |||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| vst1q_f32(y_ptr + 4, y2_vec); | |||
| a_ptr0 += 8; | |||
| a_ptr1 += 8; | |||
| a_ptr2 += 8; | |||
| a_ptr3 += 8; | |||
| y_ptr += 8; | |||
| } | |||
| if (m & 4) { | |||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
| bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||
| bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| /* Pack columns 0|2 and 1|3 into full vectors so one zip1/zip2 pair | |||
| yields the (0,1) and (2,3) interleaves for the four rows. */ | |||
| a0 = vcombine_bf16(a0x4, a2x4); | |||
| a1 = vcombine_bf16(a1x4, a3x4); | |||
| t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t1 = vreinterpretq_bf16_u16(vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| a_ptr0 += 4; | |||
| a_ptr1 += 4; | |||
| a_ptr2 += 4; | |||
| a_ptr3 += 4; | |||
| y_ptr += 4; | |||
| } | |||
| if (rest_m) { | |||
| /* x_vecx4 already includes alpha (rounded to bf16). */ | |||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||
| x2 = vgetq_lane_f32(fp32_low, 2); | |||
| x3 = vgetq_lane_f32(fp32_low, 3); | |||
| for (j = 0; j < rest_m; j++) { | |||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
| y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||
| y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||
| y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||
| } | |||
| } | |||
| x_ptr += 4; | |||
| } | |||
| /* ---- one group of 2 columns ---- */ | |||
| if (n & 2) { | |||
| a_ptr0 = a_ptr; | |||
| a_ptr1 = a_ptr0 + lda; | |||
| a_ptr += 2 * lda; | |||
| /* Build {x0, x1, x0, x1} without reading past the two remaining values. */ | |||
| x_vecx4 = vreinterpret_bf16_u16(vzip1_u16( | |||
| vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])), | |||
| vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1])) | |||
| )); | |||
| if (alpha != 1) { | |||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||
| fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
| x_vecx4 = vcvt_bf16_f32(fp32_low); | |||
| } | |||
| y_ptr = y; | |||
| for (j = 0; j < m / 8; j++) { | |||
| a0 = vld1q_bf16(a_ptr0); | |||
| a1 = vld1q_bf16(a_ptr1); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| y2_vec = vld1q_f32(y_ptr + 4); | |||
| t0 = vreinterpretq_bf16_u16( | |||
| vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| t1 = vreinterpretq_bf16_u16( | |||
| vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
| y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0); | |||
| y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| vst1q_f32(y_ptr + 4, y2_vec); | |||
| a_ptr0 += 8; | |||
| a_ptr1 += 8; | |||
| y_ptr += 8; | |||
| } | |||
| if (m & 4) { | |||
| bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
| bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
| y1_vec = vld1q_f32(y_ptr); | |||
| a0 = vcombine_bf16(a0x4, bf16_zero); | |||
| a1 = vcombine_bf16(a1x4, bf16_zero); | |||
| t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
| y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
| y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
| vst1q_f32(y_ptr, y1_vec); | |||
| a_ptr0 += 4; | |||
| a_ptr1 += 4; | |||
| y_ptr += 4; | |||
| } | |||
| if (m & 2) { | |||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||
| y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||
| y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||
| y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]); | |||
| y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]); | |||
| a_ptr0 += 2; | |||
| a_ptr1 += 2; | |||
| y_ptr += 2; | |||
| } | |||
| if (m & 1) { | |||
| fp32_low = vcvt_f32_bf16(x_vecx4); | |||
| x0 = vgetq_lane_f32(fp32_low, 0); | |||
| x1 = vgetq_lane_f32(fp32_low, 1); | |||
| y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||
| y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||
| } | |||
| x_ptr += 2; | |||
| } | |||
| /* ---- final single column, fully scalar ---- */ | |||
| if (n & 1) { | |||
| x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha; | |||
| y_ptr = y; | |||
| a_ptr0 = a_ptr; | |||
| for (j = 0; j < m; j++) { | |||
| y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| /* Generic strides: scalar implementation, one column of A at a time. */ | |||
| BLASLONG iy = 0; | |||
| if (beta == 0) { | |||
| /* Overwrite instead of multiplying so that NaN/Inf already stored in y | |||
| cannot survive a zero beta; the unit-stride path gets the same effect | |||
| from the memset in beta_op. */ | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] = 0; | |||
| iy += incy; | |||
| } | |||
| } else if (beta != 1) { | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] *= beta; | |||
| iy += incy; | |||
| } | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| x0 = alpha * vcvtah_f32_bf16(*x_ptr); | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]); | |||
| iy += incy; | |||
| } | |||
| a_ptr += lda; | |||
| x_ptr += incx; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,202 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_neon.h> | |||
| #include "common.h" | |||
| /* | |||
| * Computes y := alpha * A^T * x + beta * y for a bf16 matrix A. | |||
| * Each of the n columns of A (m contiguous entries, consecutive columns lda | |||
| * apart) is dotted with the m-element bf16 vector x, producing one fp32 | |||
| * entry of y. When beta is exactly zero the old value of y is not read. | |||
| * Returns 0 in all cases, including the early exit for m < 1 or n < 1. | |||
| */ | |||
| int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) | |||
| { | |||
| if (m < 1 || n < 1) return(0); | |||
| BLASLONG row; | |||
| BLASLONG col; | |||
| BLASLONG x_idx; | |||
| BLASLONG y_idx = 0; | |||
| bfloat16_t *col_ptr = (bfloat16_t*)(a); | |||
| bfloat16_t *x_bf = (bfloat16_t*)(x); | |||
| if (incx != 1) { | |||
| /* Strided x: plain scalar dot product per column. */ | |||
| for (col = 0; col < n; col++) { | |||
| float dot = 0.0; | |||
| x_idx = 0; | |||
| for (row = 0; row < m; row++) { | |||
| dot += vcvtah_f32_bf16(col_ptr[row]) * vcvtah_f32_bf16(x_bf[x_idx]); | |||
| x_idx += incx; | |||
| } | |||
| if (beta == 0.0f) { | |||
| y[y_idx] = alpha * dot; | |||
| } | |||
| else { | |||
| y[y_idx] = alpha * dot + beta * y[y_idx]; | |||
| } | |||
| y_idx += incy; | |||
| col_ptr += lda; | |||
| } | |||
| return (0); | |||
| } | |||
| /* Unit-stride x: split the columns into four equal runs of `quarter` | |||
| columns and walk the four runs side by side, sharing each load of x. */ | |||
| BLASLONG quarter = n / 4; | |||
| bfloat16_t *c0 = col_ptr + lda * quarter * 0; | |||
| bfloat16_t *c1 = col_ptr + lda * quarter * 1; | |||
| bfloat16_t *c2 = col_ptr + lda * quarter * 2; | |||
| bfloat16_t *c3 = col_ptr + lda * quarter * 3; | |||
| float *out0 = y + incy * quarter * 0; | |||
| float *out1 = y + incy * quarter * 1; | |||
| float *out2 = y + incy * quarter * 2; | |||
| float *out3 = y + incy * quarter * 3; | |||
| for (col = 0; col < quarter; col++) { | |||
| float32x4_t acc0 = vdupq_n_f32(0.0f); | |||
| float32x4_t acc1 = vdupq_n_f32(0.0f); | |||
| float32x4_t acc2 = vdupq_n_f32(0.0f); | |||
| float32x4_t acc3 = vdupq_n_f32(0.0f); | |||
| /* 8 rows per pass with the 128-bit bf16 dot product. */ | |||
| for (row = 0; row + 8 <= m; row += 8) { | |||
| bfloat16x8_t xq = vld1q_bf16(x_bf + row); | |||
| acc0 = vbfdotq_f32(acc0, vld1q_bf16(c0 + row), xq); | |||
| acc1 = vbfdotq_f32(acc1, vld1q_bf16(c1 + row), xq); | |||
| acc2 = vbfdotq_f32(acc2, vld1q_bf16(c2 + row), xq); | |||
| acc3 = vbfdotq_f32(acc3, vld1q_bf16(c3 + row), xq); | |||
| } | |||
| /* One optional 4-row step; its two partial sums are folded into the | |||
| low half of the running accumulator. */ | |||
| if (row + 4 <= m) { | |||
| bfloat16x4_t xd = vld1_bf16(x_bf + row); | |||
| float32x2_t p0 = vbfdot_f32(vdup_n_f32(0.0f), vld1_bf16(c0 + row), xd); | |||
| float32x2_t p1 = vbfdot_f32(vdup_n_f32(0.0f), vld1_bf16(c1 + row), xd); | |||
| float32x2_t p2 = vbfdot_f32(vdup_n_f32(0.0f), vld1_bf16(c2 + row), xd); | |||
| float32x2_t p3 = vbfdot_f32(vdup_n_f32(0.0f), vld1_bf16(c3 + row), xd); | |||
| acc0 = vcombine_f32(vadd_f32(p0, vget_low_f32(acc0)), vget_high_f32(acc0)); | |||
| acc1 = vcombine_f32(vadd_f32(p1, vget_low_f32(acc1)), vget_high_f32(acc1)); | |||
| acc2 = vcombine_f32(vadd_f32(p2, vget_low_f32(acc2)), vget_high_f32(acc2)); | |||
| acc3 = vcombine_f32(vadd_f32(p3, vget_low_f32(acc3)), vget_high_f32(acc3)); | |||
| row += 4; | |||
| } | |||
| /* Write the vector part of the result; beta == 0 must not read y. */ | |||
| if (beta == 0.0f) { | |||
| out0[y_idx] = alpha * vaddvq_f32(acc0); | |||
| out1[y_idx] = alpha * vaddvq_f32(acc1); | |||
| out2[y_idx] = alpha * vaddvq_f32(acc2); | |||
| out3[y_idx] = alpha * vaddvq_f32(acc3); | |||
| } | |||
| else { | |||
| out0[y_idx] = alpha * vaddvq_f32(acc0) + beta * out0[y_idx]; | |||
| out1[y_idx] = alpha * vaddvq_f32(acc1) + beta * out1[y_idx]; | |||
| out2[y_idx] = alpha * vaddvq_f32(acc2) + beta * out2[y_idx]; | |||
| out3[y_idx] = alpha * vaddvq_f32(acc3) + beta * out3[y_idx]; | |||
| } | |||
| /* Remaining 0-3 rows are added on top, one scalar product at a time. */ | |||
| for (; row < m; ++row) { | |||
| float xs = vcvtah_f32_bf16(x_bf[row]); | |||
| out0[y_idx] += alpha * vcvtah_f32_bf16(c0[row]) * xs; | |||
| out1[y_idx] += alpha * vcvtah_f32_bf16(c1[row]) * xs; | |||
| out2[y_idx] += alpha * vcvtah_f32_bf16(c2[row]) * xs; | |||
| out3[y_idx] += alpha * vcvtah_f32_bf16(c3[row]) * xs; | |||
| } | |||
| y_idx += incy; | |||
| c0 += lda; | |||
| c1 += lda; | |||
| c2 += lda; | |||
| c3 += lda; | |||
| } | |||
| /* Columns left after the four runs (n % 4 of them). c3 now points just | |||
| past the fourth run, and out3 indexed by y_idx addresses the matching | |||
| entry of y. */ | |||
| col_ptr = c3; | |||
| float *out = out3; | |||
| for (col = quarter * 4; col < n; col++) { | |||
| float32x4_t acc = vdupq_n_f32(0.0f); | |||
| for (row = 0; row + 8 <= m; row += 8) { | |||
| bfloat16x8_t xq = vld1q_bf16(x_bf + row); | |||
| acc = vbfdotq_f32(acc, vld1q_bf16(col_ptr + row), xq); | |||
| } | |||
| if (row + 4 <= m) { | |||
| bfloat16x4_t xd = vld1_bf16(x_bf + row); | |||
| float32x2_t p = vbfdot_f32(vdup_n_f32(0.0f), vld1_bf16(col_ptr + row), xd); | |||
| acc = vcombine_f32(vadd_f32(p, vget_low_f32(acc)), vget_high_f32(acc)); | |||
| row += 4; | |||
| } | |||
| if (beta == 0.0f) { | |||
| out[y_idx] = alpha * vaddvq_f32(acc); | |||
| } | |||
| else { | |||
| out[y_idx] = alpha * vaddvq_f32(acc) + beta * out[y_idx]; | |||
| } | |||
| for (; row < m; ++row) { | |||
| out[y_idx] += alpha * vcvtah_f32_bf16(col_ptr[row]) * vcvtah_f32_bf16(x_bf[row]); | |||
| } | |||
| y_idx += incy; | |||
| col_ptr += lda; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,80 @@ | |||
| /* | |||
| Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. | |||
| SPDX-License-Identifier: BSD-3-Clause-Clear | |||
| */ | |||
| #include "common.h" | |||
| #include <stdlib.h> | |||
| #include <inttypes.h> | |||
| #include <math.h> | |||
| #if defined(HAVE_SME) | |||
| /* Function prototypes */ | |||
| /* Declared here and defined outside this file; the __asm__ label pins the | |||
| * linker-level symbol name. Called below as (M, K, A, A_mod) to fill the | |||
| * scratch copy of the left matrix. */ | |||
| extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ | |||
| const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); | |||
| /* Declared here and defined outside this file. Called below as | |||
| * (M, K, N, A_mod, B, R). | |||
| * NOTE(review): matResult is const-qualified although the caller passes its | |||
| * output matrix R here - confirm against the assembly implementation. */ | |||
| extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ | |||
| const float * matLeft,\ | |||
| const float * restrict matRight,\ | |||
| const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); | |||
| /* Function Definitions */ | |||
| /* | |||
| * Returns the number of 32-bit elements in one streaming-mode vector: | |||
| * RDSVL with multiplier 1 yields the streaming vector length in bytes, and | |||
| * the logical shift right by 2 divides that by 4. | |||
| */ | |||
| uint64_t sve_cntw() { | |||
| uint64_t cnt; | |||
| asm volatile( | |||
| "rdsvl %[res], #1\n" | |||
| "lsr %[res], %[res], #2\n" | |||
| : [res] "=r" (cnt) :: | |||
| ); | |||
| return cnt; | |||
| } | |||
| /*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ | |||
| float * __restrict A, BLASLONG strideA, float * __restrict B,\ | |||
| BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||
| */ | |||
| /* | |||
| * Direct SGEMM for SME: copies the left matrix A into a freshly allocated | |||
| * scratch buffer via sgemm_direct_sme1_preprocess, then lets | |||
| * sgemm_direct_sme1_2VLx2VL combine that buffer with B into R. | |||
| * strideA, strideB and strideR are accepted but not used here. | |||
| * The scratch buffer is released before returning. If it cannot be | |||
| * allocated the process is aborted, because this void interface has no way | |||
| * to report the failure. | |||
| */ | |||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||
| float * __restrict R, BLASLONG strideR){ | |||
| uint64_t m_mod, vl_elms; | |||
| size_t a_mod_bytes; | |||
| vl_elms = sve_cntw(); | |||
| /* Round M up to the next multiple of the vector length in pure integer | |||
| * arithmetic; going through double/ceil is inexact for very large M. | |||
| */ | |||
| m_mod = ((uint64_t)M + vl_elms - 1) / vl_elms * vl_elms; | |||
| a_mod_bytes = m_mod*K*sizeof(float); | |||
| float *A_mod = (float *) malloc(a_mod_bytes); | |||
| /* malloc(0) may legitimately return NULL, so only a failed non-empty | |||
| * request is treated as fatal. | |||
| */ | |||
| if (A_mod == NULL && a_mod_bytes != 0) { | |||
| abort(); | |||
| } | |||
| /* Empty asm statement that declares every SVE predicate (p0-p15) and | |||
| * vector (z0-z31) register clobbered, so the compiler does not keep | |||
| * values in those registers across the calls below. | |||
| * */ | |||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
| /* Pre-process the left matrix to make it suitable for | |||
| matrix sum of outer-product calculation | |||
| */ | |||
| sgemm_direct_sme1_preprocess(M, K, A, A_mod); | |||
| /* Calculate C = A*B */ | |||
| sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | |||
| /* Same clobber barrier after the assembly routines have run. */ | |||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
| free(A_mod); | |||
| } | |||
| #else | |||
| /* Fallback compiled when HAVE_SME is not defined: same signature as the SME | |||
| * version above, but the body is empty, so R is left untouched. */ | |||
| void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||
| float * __restrict R, BLASLONG strideR){} | |||
| #endif | |||