Browse Source

Merge pull request #5316 from OpenMathLib/develop

Update from develop for 0.3.30 release
tags/v0.3.30
Martin Kroeker GitHub 7 months ago
parent
commit
e81fca06dd
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
100 changed files with 4121 additions and 765 deletions
  1. +7
    -7
      .cirrus.yml
  2. +1
    -0
      .github/workflows/apple_m.yml
  3. +41
    -10
      .github/workflows/c910v.yml
  4. +2
    -2
      .github/workflows/codspeed-bench.yml
  5. +23
    -1
      .github/workflows/dynamic_arch.yml
  6. +1
    -1
      .github/workflows/loongarch64_clang.yml
  7. +2
    -2
      .github/workflows/mips64.yml
  8. +107
    -28
      CMakeLists.txt
  9. +23
    -1
      CONTRIBUTORS.md
  10. +134
    -0
      Changelog.txt
  11. +6
    -1
      Makefile
  12. +7
    -2
      Makefile.arm64
  13. +2
    -2
      Makefile.install
  14. +1
    -1
      Makefile.rule
  15. +14
    -0
      Makefile.system
  16. +1
    -0
      TargetList.txt
  17. +40
    -14
      azure-pipelines.yml
  18. +19
    -0
      c_check
  19. +15
    -14
      cmake/arch.cmake
  20. +28
    -8
      cmake/cc.cmake
  21. +12
    -12
      cmake/fc.cmake
  22. +3
    -0
      cmake/kernel.cmake
  23. +8
    -6
      cmake/lapacke.cmake
  24. +27
    -6
      cmake/prebuild.cmake
  25. +39
    -18
      cmake/system.cmake
  26. +11
    -0
      cmake/system_check.cmake
  27. +8
    -0
      cmake/utils.cmake
  28. +1
    -0
      common.h
  29. +9
    -1
      common_arm.h
  30. +1
    -1
      common_arm64.h
  31. +2
    -0
      common_d.h
  32. +3
    -3
      common_level1.h
  33. +3
    -0
      common_macro.h
  34. +8
    -0
      common_param.h
  35. +2
    -0
      common_q.h
  36. +4
    -2
      common_s.h
  37. +9
    -2
      common_zarch.h
  38. +3
    -0
      cpuid.S
  39. +40
    -4
      cpuid_arm64.c
  40. +17
    -0
      cpuid_x86.c
  41. +1
    -17
      ctest/CMakeLists.txt
  42. +4
    -4
      ctest/Makefile
  43. +1
    -1
      ctest/c_cblat1c.c
  44. +3
    -3
      docs/install.md
  45. +4
    -0
      driver/level2/CMakeLists.txt
  46. +4
    -0
      driver/level3/CMakeLists.txt
  47. +34
    -16
      driver/level3/level3_thread.c
  48. +4
    -0
      driver/others/CMakeLists.txt
  49. +4
    -2
      driver/others/blas_server.c
  50. +67
    -4
      driver/others/dynamic_arm64.c
  51. +1
    -1
      exports/Makefile
  52. +154
    -29
      exports/gensymbol.pl
  53. +7
    -0
      f_check
  54. +13
    -0
      getarch.c
  55. +11
    -7
      interface/CMakeLists.txt
  56. +12
    -12
      interface/Makefile
  57. +86
    -14
      interface/gemm.c
  58. +36
    -0
      interface/gemmt.c
  59. +65
    -12
      interface/gemv.c
  60. +21
    -7
      interface/lapack/gesv.c
  61. +63
    -1
      interface/nrm2.c
  62. +6
    -134
      interface/rotm.c
  63. +10
    -5
      interface/zgemv.c
  64. +2
    -2
      interface/zscal.c
  65. +2
    -2
      interface/zsyr.c
  66. +20
    -3
      kernel/CMakeLists.txt
  67. +4
    -0
      kernel/Makefile
  68. +25
    -4
      kernel/Makefile.L1
  69. +33
    -1
      kernel/Makefile.L3
  70. +12
    -0
      kernel/alpha/KERNEL
  71. +10
    -0
      kernel/arm/KERNEL
  72. +40
    -49
      kernel/arm/zscal.c
  73. +10
    -0
      kernel/arm64/KERNEL
  74. +2
    -2
      kernel/arm64/KERNEL.A64FX
  75. +9
    -4
      kernel/arm64/KERNEL.ARMV8SVE
  76. +3
    -0
      kernel/arm64/KERNEL.ARMV9SME
  77. +15
    -1
      kernel/arm64/KERNEL.NEOVERSEN1
  78. +6
    -4
      kernel/arm64/KERNEL.NEOVERSEN2
  79. +20
    -0
      kernel/arm64/KERNEL.NEOVERSEV1
  80. +5
    -0
      kernel/arm64/KERNEL.NEOVERSEV2
  81. +12
    -0
      kernel/arm64/KERNEL.generic
  82. +217
    -216
      kernel/arm64/copy_thunderx2t99.c
  83. +2
    -0
      kernel/arm64/dasum_thunderx2t99.c
  84. +1
    -1
      kernel/arm64/dgemm_small_kernel_tn_sve.c
  85. +1
    -1
      kernel/arm64/dgemm_small_kernel_tt_sve.c
  86. +50
    -3
      kernel/arm64/dot.c
  87. +3
    -2
      kernel/arm64/dot_kernel_asimd.c
  88. +71
    -12
      kernel/arm64/gemv_n_sve.c
  89. +138
    -0
      kernel/arm64/gemv_n_sve_v1x3.c
  90. +207
    -0
      kernel/arm64/gemv_n_sve_v4x3.c
  91. +55
    -39
      kernel/arm64/gemv_t_sve_v1x3.c
  92. +2
    -1
      kernel/arm64/sasum_thunderx2t99.c
  93. +83
    -0
      kernel/arm64/sbgemm_beta_neoversev1.c
  94. +46
    -0
      kernel/arm64/sbgemm_kernel_4x4_neoversev1.c
  95. +414
    -0
      kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c
  96. +148
    -0
      kernel/arm64/sbgemm_ncopy_4_neoversev1.c
  97. +361
    -0
      kernel/arm64/sbgemm_tcopy_4_neoversev1.c
  98. +515
    -0
      kernel/arm64/sbgemv_n_neon.c
  99. +202
    -0
      kernel/arm64/sbgemv_t_bfdot.c
  100. +80
    -0
      kernel/arm64/sgemm_direct_arm64_sme1.c

+ 7
- 7
.cirrus.yml View File

@@ -58,8 +58,8 @@ task:
- export VALID_ARCHS="i386 x86_64"
- xcrun --sdk macosx --show-sdk-path
- xcodebuild -version
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64"
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64"
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:
@@ -78,8 +78,8 @@ task:
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0"
- xcrun --sdk iphoneos --show-sdk-path
- ls -l /Applications
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
@@ -127,7 +127,7 @@ task:
FreeBSD_task:
name: FreeBSD-gcc
freebsd_instance:
image_family: freebsd-14-1
image_family: freebsd-14-2
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
@@ -138,7 +138,7 @@ FreeBSD_task:
FreeBSD_task:
name: freebsd-gcc-ilp64
freebsd_instance:
image_family: freebsd-14-1
image_family: freebsd-14-2
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
@@ -148,7 +148,7 @@ FreeBSD_task:
FreeBSD_task:
name: FreeBSD-clang-openmp
freebsd_instance:
image_family: freebsd-14-1
image_family: freebsd-14-2
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so


+ 1
- 0
.github/workflows/apple_m.yml View File

@@ -102,6 +102,7 @@ jobs:
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DUSE_OPENMP=${{matrix.openmp}} \
-DOpenMP_Fortran_LIB_NAMES=omp \
-DINTERFACE64=${{matrix.ilp64}} \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \


+ 41
- 10
.github/workflows/c910v.yml View File

@@ -31,27 +31,28 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: install build deps
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev

- name: checkout qemu
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
repository: T-head-Semi/qemu
repository: XUANTIE-RV/qemu
path: qemu
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0

- name: build qemu
run: |
# Force use c910v qemu-user
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch
cd qemu
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
make -j$(nproc)
make install
@@ -82,9 +83,39 @@ jobs:

- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
qemu-riscv64 ./utest/openblas_utest
qemu-riscv64 ./utest/openblas_utest_ext
run_with_retry() {
local cmd="$1"
local time_out=10
local retries=10
local attempt=0

for ((i=1; i<=retries; i++)); do
attempt=$((i))
if timeout -s 12 --preserve-status $time_out $cmd; then
echo "Command succeeded on attempt $i."
return 0
else
local exit_code=$?
if [ $exit_code -eq 140 ]; then
echo "Attempt $i timed out (retrying...)"
time_out=$((time_out + 5))
else
echo "Attempt $i failed with exit code $exit_code. Aborting workflow."
exit $exit_code
fi
fi
done
echo "All $retries attempts failed, giving up."
echo "Final failure was due to timeout."
echo "Aborting workflow."
exit $exit_code
}
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
which qemu-riscv64
export QEMU_BIN=$(which qemu-riscv64)
run_with_retry "$QEMU_BIN ./utest/openblas_utest"
run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext"

OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1


+ 2
- 2
.github/workflows/codspeed-bench.yml View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
fortran: [gfortran]
build: [make]
pyver: ["3.12"]
@@ -147,7 +147,7 @@ jobs:
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'

- name: Run benchmarks
uses: CodSpeedHQ/action@v2
uses: CodSpeedHQ/action@v3
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: |


+ 23
- 1
.github/workflows/dynamic_arch.yml View File

@@ -43,7 +43,9 @@ jobs:
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5
sudo apt-get install -y gfortran cmake ccache
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
@@ -354,3 +356,23 @@ jobs:
- name: Build OpenBLAS
run: |
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }}

neoverse_build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-24.04-arm

steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y gcc gfortran make
- name: Build OpenBLAS
run: |
make -j${nproc}
make -j${nproc} lapack-test

+ 1
- 1
.github/workflows/loongarch64_clang.yml View File

@@ -41,7 +41,7 @@ jobs:
- name: Install APT deps
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev

- name: Download and install loongarch64-toolchain
run: |


+ 2
- 2
.github/workflows/mips64.yml View File

@@ -41,14 +41,14 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev

- name: checkout qemu
uses: actions/checkout@v3
with:
repository: qemu/qemu
path: qemu
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
ref: ae35f033b874c627d81d51070187fbf55f0bf1a7

- name: build qemu
run: |


+ 107
- 28
CMakeLists.txt View File

@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)

set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 29)
set(OpenBLAS_PATCH_VERSION 29.dev)

set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers)
#######
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)

option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF)

option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)

set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
@@ -60,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th

option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
option(BUILD_STATIC_LIBS "Build static library" OFF)
option(BUILD_SHARED_LIBS "Build shared library" OFF)
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
endif()
@@ -75,12 +78,27 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in

set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )

if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL ""))
set (DELETE_STATIC_LIBS "")
if (NOT BUILD_STATIC_LIBS)
message (STATUS "forcing build of a temporary static library for symbol renaming")
set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE)
set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib))
endif ()
endif()


#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()

if (BUILD_WITHOUT_LAPACKE)
set(NO_LAPACKE 1)
endif()

if(BUILD_WITHOUT_CBLAS)
set(NO_CBLAS 1)
endif()
@@ -103,14 +121,15 @@ endif()

message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")

if (USE_OPENMP)
find_package(OpenMP REQUIRED)
endif ()

include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64)
if (${HAVE64} GREATER -1)
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX})
else ()
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
endif ()

set(BLASDIRS interface driver/level2 driver/level3 driver/others)

@@ -224,6 +243,12 @@ endif ()
# add objects to the openblas lib
if(NOT NO_LAPACK)
add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES})
if (USE_OPENMP AND (NOT NOFORTRAN))
# Disable OpenMP for LAPACK Fortran codes on Windows.
if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran)
endif()
endif()
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>")
endif()
if(NOT NO_LAPACKE)
@@ -265,30 +290,59 @@ endif()

if (USE_OPENMP)
if(BUILD_STATIC_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
if(NOFORTRAN)
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
else()
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran)
endif()
endif()
if(BUILD_SHARED_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
if(NOFORTRAN)
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
else()
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran)
endif()
endif()
endif()

# Seems that this hack doesn't required since macOS 11 Big Sur
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
if (NOT NOFORTRAN)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
else ()
set (CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
endif ()
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
# Use response files
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
# Always build static library first
if(BUILD_STATIC_LIBS)
set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a")
else()
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a")
endif()
set(CREATE_STATIC_LIBRARY_COMMAND
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' "
"sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ")
if(BUILD_SHARED_LIBS)
add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static)
set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib")
endif()
if(USE_OPENMP)
get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES)
else()
set(OMP_LIB "")
endif()
if(NOT NOFORTRAN)
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
if(BUILD_SHARED_LIBS)
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'")
endif()
else()
set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
if(BUILD_SHARED_LIBS)
set(CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'")
endif()
endif()
endif()

# Handle MSVC exports
@@ -373,7 +427,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
endif()
endif()

if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
@@ -461,10 +515,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
else ()
set (BZ 0)
endif()

if (CMAKE_SYSTEM_NAME MATCHES "Windows")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if (CMAKE_BUILD_TYPE MATCHES "Debug")
set (CRTLIB msvcrtd)
set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb)
set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
else ()
set (CRTLIB msvcrt)
set (PDBOPT "")
endif()
#if (USE_PERL)
message(STATUS "adding postbuild instruction to rename syms")
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c
COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT}
#COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a
${REMOVE_STATIC_LIB} VERBATIM
)
#endif ()
else ()
if (NOT USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
else()
@@ -475,6 +552,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
)
endif()
endif()
endif()

if (BUILD_BENCHMARKS)
#find_package(OpenMP REQUIRED)
@@ -645,3 +723,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})


+ 23
- 1
CONTRIBUTORS.md View File

@@ -26,6 +26,9 @@
* Chris Sidebottom <chris.sidebottom@arm.com>
* Optimizations and other improvements targeting AArch64

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* Optimizations and other improvements targeting AArch64

## Previous Developers

* Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -231,4 +234,23 @@ In chronological order:
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems

* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
* [2025-02-26] Add sbgemv_t_bfdot kernel
* [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13
* [2025-03-12] Optimize aarch64 sgemm_ncopy

* Marek Michalowski <marek.michalowski@arm.com>
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`
* [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2`
* [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2`

* Ye Tao <ye.tao@arm.com>
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
* [2025-02-27] Add sbgemv_n_neon kernel

* Abhishek Kumar <https://github.com/abhishek-iitmadras>
* [2025-04-22] Optimise dot kernel for NEOVERSE V1

+ 134
- 0
Changelog.txt View File

@@ -1,4 +1,138 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.30
19-Jun-2025

general:
- fixed an installation problem with the thread safety test in gmake builds
- fixed spurious overwriting of an input array in complex GEMMT/GEMMTR
- fixed naming of GEMMTR in error messages from XERBLA
- fixed compilation of SBGEMMT/SBGEMMTR in CMake builds
- fixed the implementation of ?NRM2 to handle INCX=0 correctly
- removed tests for CSROT and ZDROT that relied on unspecified behavior
- fixed a performance regression in multithreaded GEMM that was particularly
serious on POWER targets
- fixed linking issues when using LLVM's flang-new with gmake
- fixed a potential thread safety problem with C11 atomic operations
- further improved the workload partitioning in parallel GEMM
- fixed omission of LAPACKE interfaces for CGESVDQ,CTRSYL3 and ?GEQPF in
CMake builds
- fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies
for LAPACK function SPMV in CMake builds
- added explicit CMake options for building LAPACKE and shared libraries
- simplified and improved handling of OpenMP options in CMake builds
- reworked Windows DLL generation in CMake builds to ensure correct symbol
renaming (pre/postfixing) and optional generation of PDB files for debugging
- updated the Perl script version of the gensymbol utility for use with
Windows-on-Arm
- fixed building with (MinGW) gmake on Windows to ensure completeness of the
LAPACK included in the static library (potential race condition due to the
Windows version of the "ln" utility creating snapshot copies rather than links)
- fixed unwanted deletion of the lapacke_mangling.h file by "make clean"
- fixed potential duplication of a _64 suffix on library names in CMake builds
- fixed compilation of the C fallback copies of the LAPACK code with GCC 15
- included fixes from the Reference-LAPACK project:
- fixed a truncated error message in the EIG part of the testsuite
(Reference-LAPACK PR 1119)
- fixed too strict check in LAPACKE_?gesdd_work (PR #1126)
- fixed memory corruption when calling ?GEEV with non-finite data (PR #1128)
- fixed missing initialization of a variable in C/GEQP3RK (PR #1131)
- fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135)

x86_64:
- fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- improved the compiler identification code for flang-new
- fixed a potential build issue in the ZSUM kernel
- fixed "argument list too long" errors when building on MacOS
- added cpu autodetection support for several new Arrow Lake models
- fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
- fixed compilation with the MinGW build of GCC 15

arm64:
- fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29)
- added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds
- added an optimized SBGEMM kernel for NEOVERSEV1
- improved 1xN SBGEMM performance by forwarding to SBGEMV
- introduced a stepwise increase of the thread count used for
SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size
- introduced a stepwise increase of the thread count used for
DGEMV on NEOVERSEV1 in relation to problem size
- introduced a stepwise increase of the thread count used for
SDOT and DDOT on NEOVERSEV1 in relation to problem size
- worked around assembler limitations in LLVM for Windows-on-Arm
- enabled cpu type autodetection from the registry on Windows-on-Arm
- improved multithreading threshold for GEMV and GESV on Windows-on-Arm
- fixed overoptimization issues with LLVM's flang in Windows-on-Arm
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- added a fast path SGEMM kernel for small workloads on SME capable targets
- improved performance of SGEMM and DGEMM kernels for small workloads
- improved performance of SGEMV and DGEMV on SVE-capable targets
- improved performance of SGEMV on NEOVERSEN1 and Apple M
- added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all
SVE capable targets
- added optimized SBGEMV kernels for NEOVERSEV1/V2/N2
- improved performance of SGEMM through faster NCOPY kernels
- added compiler options for the NVIDIA HPC Compiler Suite
- fixed compilation on OSX with XCode 16.3 and later
- fixed cpu core type and cache size detection on Apple M4
- updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake
- fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds
- fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
- fixed potential miscompilation of the non-SVE SDOT kernel

riscv64:
- added optimized SROTM and DROTM kernels for x280
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- improved performance of GEMM_TCOPY on RVV1.0 targets with
VLEN of 128 or 256
- improved performance of OMATCOPY on targets with VLEN 256
- greatly improved performance of SGEMV/DGEMV
- improved performance of CGEMV and ZGEMV on C910V and all RVV targets
with VLEN 256
- improved performance of SAXPBY and DAXPBY on C910V and all RVV targets
with VLEN 256
- improved performance of AXPY and DOT on C910V and ZVL256B targets by
falling back to non-vectorized code for very small N. (Thereby fixing
poor performance of CHBMV/ZHBMV for very small K)
- fixed CMake build failures of the TRMM kernels

loongarch64:
- improved performance of the LSX versions of SSYMV/DSYMV
- made the LASX versions of the DSYMV and SSYMV kernels
compatible with hardware changes in LA664 and future targets
- fixed inaccuracies in several LASX kernels
- improved compatibility of LSX kernels with LA264 targets
- fixed handling of deprecated target names in CMake builds
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL

power:
- fixed building for PPCG4 with CMake
- fixed SSCAL/DSCAL on PPC970 running FreeBSD
- fixed a potential alignment issue in the POWER8 SGEMV kernel
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL

zarch:
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- fixed unwanted generation of object files with a writable stack

x86:
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- worked around potential miscompilation of CDOT with very old binutils

arm:
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
- fixed unwanted generation of object files with a writable stack

sparc:
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL

alpha:
- fixed build failure caused by spurious Windows-only typecasts

cell:
- fixed probable build issue caused by spurious Windows-only typecasts
====================================================================
Version 0.3.29
12-Jan-2025


+ 6
- 1
Makefile View File

@@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
fi
endif

# On WINNT, run $(LNCMD) with $(LIBNAME) and $(LIBPREFIX).$(LIBSUFFIX) as its
# two arguments. '@' suppresses echoing of the command and '-' lets make carry
# on if the command fails.
ifeq ($(OSNAME), WINNT)
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif

ifneq ($(OSNAME), AIX)
@echo -n " Library Name ... $(LIBNAME)"
else
@@ -447,7 +452,7 @@ endif
@rm -f cblas.tmp cblas.tmp2
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)


+ 7
- 2
Makefile.arm64 View File

@@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve
endif
endif

# ARMV9SME core: C sources are built for Armv9-A with SVE2 and SME enabled,
# while the Fortran flags request Armv9-A with SVE2 only (no +sme).
ifeq ($(CORE), ARMV9SME)
CCOMMON_OPT += -march=armv9-a+sve2+sme
FCOMMON_OPT += -march=armv9-a+sve2
endif

ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
@@ -101,7 +106,7 @@ ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.4-a+sve
CCOMMON_OPT += -march=armv8.4-a+sve+bf16
ifeq (1, $(ISCLANG))
CCOMMON_OPT += -mtune=cortex-x1
else
@@ -111,7 +116,7 @@ ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
else
CCOMMON_OPT += -march=armv8.4-a+sve
CCOMMON_OPT += -march=armv8.4-a+sve+bf16
ifneq ($(CROSS), 1)
CCOMMON_OPT += -mtune=native
endif


+ 2
- 2
Makefile.install View File

@@ -315,8 +315,8 @@ endif

endif
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
endif
endif


+ 1
- 1
Makefile.rule View File

@@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.3.29
VERSION = 0.3.29.dev

# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library


+ 14
- 0
Makefile.system View File

@@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1
endif
ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1
GEMM_GEMV_FORWARD_BF16 = 1
endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1
@@ -420,6 +421,7 @@ ifeq ($(ARCH), arm64)
export MACOSX_DEPLOYMENT_TARGET=11.0
ifeq ($(C_COMPILER), GCC)
export NO_SVE = 1
export NO_SME = 1
endif
else
export MACOSX_DEPLOYMENT_TARGET=10.8
@@ -434,6 +436,11 @@ ifeq (x$(XCVER), x 15)
CCOMMON_OPT += -Wl,-ld_classic
FCOMMON_OPT += -Wl,-ld_classic
endif
# With Xcode 16 and gfortran, remove the -lto_library entry from the extra
# C libraries while keeping every other entry.
# GNU make only recognises a function call when the function name is followed
# by whitespace; "$(filter-out(...))" would be read as a reference to an
# undefined variable and expand to nothing, wiping CEXTRALIB entirely.
ifeq (x$(XCVER), x 16)
ifeq ($(F_COMPILER), GFORTRAN)
override CEXTRALIB := $(filter-out -lto_library, $(CEXTRALIB))
endif
endif
endif

ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
@@ -709,6 +716,9 @@ DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
DYNAMIC_CORE += A64FX
endif
# Add ARMV9SME to the DYNAMIC_CORE list unless NO_SME is set to 1.
ifneq ($(NO_SME), 1)
DYNAMIC_CORE += ARMV9SME
endif
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
@@ -1472,6 +1482,10 @@ ifeq ($(NO_SVE), 1)
CCOMMON_OPT += -DNO_SVE
endif

# Expose NO_SME to the C sources as a preprocessor definition when it is set to 1.
ifeq ($(NO_SME), 1)
CCOMMON_OPT += -DNO_SME
endif

ifdef SMP
CCOMMON_OPT += -DSMP_SERVER



+ 1
- 0
TargetList.txt View File

@@ -111,6 +111,7 @@ THUNDERX3T110
VORTEX
A64FX
ARMV8SVE
ARMV9SME
FT2000

9.System Z:


+ 40
- 14
azure-pipelines.yml View File

@@ -25,14 +25,28 @@ jobs:
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
CC=gcc && \
make QUIET_MAKE=1 BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \
make -C test BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \
make -C ctest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \
make -C utest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: manylinux_32bit
pool:
vmImage: 'ubuntu-latest'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux2014_i686
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
CC=gcc && \
make QUIET_MAKE=1 BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \
make -C test BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \
make -C ctest BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \
make -C utest BINARY=32 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile
docker build .
displayName: Run manylinux 32bit docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-latest'
@@ -141,7 +155,7 @@ jobs:

- job: OSX_OpenMP
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
steps:
- script: |
brew update
@@ -151,7 +165,7 @@ jobs:
- job: OSX_GCC_Nothreads
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
steps:
- script: |
brew update
@@ -164,7 +178,19 @@ jobs:
- script: |
brew update
make CC=gcc-12 FC=gfortran-12

- job: OSX_LLVM_flangnew
pool:
vmImage: 'macOS-latest'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
steps:
- script: |
brew update
brew install llvm flang
make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1

- job: OSX_OpenMP_Clang
pool:
vmImage: 'macOS-latest'
@@ -195,7 +221,7 @@ jobs:
- job: OSX_dynarch_cmake
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
@@ -242,7 +268,7 @@ jobs:
- job: OSX_NDK_ARMV7
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
steps:
- script: |
brew update
@@ -252,7 +278,7 @@ jobs:

- job: OSX_IOS_ARMV8
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
variables:
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0
@@ -262,7 +288,7 @@ jobs:

- job: OSX_IOS_ARMV7
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
variables:
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1
@@ -272,7 +298,7 @@ jobs:

- job: OSX_xbuild_DYNAMIC_ARM64
pool:
vmImage: 'macOS-12'
vmImage: 'macOS-13'
variables:
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64


+ 19
- 0
c_check View File

@@ -334,6 +334,24 @@ if [ "$architecture" = "arm64" ]; then
rm -rf "$tmpd"
fi

# Probe whether the compiler can assemble SME instructions (arm64 only).
# no_sme stays 0 when the probe succeeds or is not run, and becomes 1 when
# both compile attempts below fail.
no_sme=0
if [ "$architecture" = "arm64" ]; then
# Scratch directory; the second mktemp form is the fallback when the first fails.
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.S"
# Minimal assembly routine that only executes smstart/smstop and returns.
printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf"
args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf"
no_sme=0
{
# First attempt: Armv9-A with both SVE2 and SME.
$compiler_name $flags $args >/dev/null 2>&1
} || {
# Second attempt, only reached if the first failed: Armv9-A with SME alone.
args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf"
$compiler_name $flags $args >/dev/null 2>&1
} || {
# Neither flag set was accepted: report SME as unavailable.
no_sme=1
}
rm -rf "$tmpd"
fi

c11_atomics=0
case "$data" in
*HAVE_C11*)
@@ -475,6 +493,7 @@ done
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n"
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_sme" -eq 1 ] && printf "NO_SME=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"


+ 15
- 14
cmake/arch.cmake View File

@@ -31,22 +31,23 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()

if (USE_OPENMP)
# USE_SIMPLE_THREADED_LEVEL3 = 1
# NO_AFFINITY = 1
find_package(OpenMP REQUIRED)
if (OpenMP_FOUND)
set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP")
set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}")
endif()
endif ()


if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
endif()
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
endif()
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})


+ 28
- 8
cmake/cc.cmake View File

@@ -84,7 +84,7 @@ endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
if (POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
else ()
elseif (X86_64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
endif ()
endif ()
@@ -182,7 +182,9 @@ endif ()

if (${CORE} STREQUAL A64FX)
if (NOT DYNAMIC_ARCH)
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx")
elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
@@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2)
if (NOT DYNAMIC_ARCH)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2")
else ()
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
@@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1)
if (NOT DYNAMIC_ARCH)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1")
else ()
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
@@ -220,10 +226,12 @@ endif ()

if (${CORE} STREQUAL NEOVERSEN1)
if (NOT DYNAMIC_ARCH)
if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1")
elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
endif()
endif ()
endif ()
@@ -232,21 +240,33 @@ if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
endif ()

# Compiler flags for a non-DYNAMIC_ARCH build targeting ARMV9SME:
# NVC is told to target the host CPU, every other compiler gets -march=armv9-a+sme.
# NOTE(review): the NVC branch is gated on NO_SVE rather than NO_SME - confirm this is intended.
if (${CORE} STREQUAL ARMV9SME)
if (NOT DYNAMIC_ARCH)
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme")
endif ()
endif ()
endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

if (${CORE} STREQUAL CORTEXA710)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

@@ -258,7 +278,7 @@ endif ()

if (${CORE} STREQUAL CORTEXX2)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()



+ 12
- 12
cmake/fc.cmake View File

@@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee")
endif ()
@@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif ()

if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

if (${F_COMPILER} STREQUAL "FUJITSU")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM")
set(FCOMMON_OPT "${FCOMMON_OPT} -q32")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE")
endif ()

if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64")

if (USE_OPENMP)
set(FEXTRALIB "${FEXTRALIB} -lstdc++")
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN")
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel")
set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

if (${F_COMPILER} STREQUAL "COMPAQ")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY")
if (NOT USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR")
# -w=unused: Suppress warning messages about unused variables
set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()



+ 3
- 0
cmake/kernel.cmake View File

@@ -79,6 +79,9 @@ macro(SetDefaultL1)
SetFallback(CROTKERNEL zrot.S)
SetFallback(ZROTKERNEL zrot.S)
SetFallback(XROTKERNEL zrot.S)
SetFallback(SROTMKERNEL rotm.S)
SetFallback(DROTMKERNEL rotm.S)
SetFallback(QROTMKERNEL rotm.S)
SetFallback(SSCALKERNEL scal.S)
SetFallback(DSCALKERNEL scal.S)
SetFallback(CSCALKERNEL zscal.S)


+ 8
- 6
cmake/lapacke.cmake View File

@@ -98,6 +98,8 @@ set(CSRC
lapacke_cgesv_work.c
lapacke_cgesvd.c
lapacke_cgesvd_work.c
lapacke_cgesvdq.c
lapacke_cgesvdq_work.c
lapacke_cgesvdx.c
lapacke_cgesvdx_work.c
lapacke_cgesvj.c
@@ -1766,8 +1768,8 @@ set(SSRC
lapacke_strsna_work.c
lapacke_strsyl.c
lapacke_strsyl_work.c
lapacke_ctrsyl3.c
lapacke_ctrsyl3_work.c
lapacke_strsyl3.c
lapacke_strsyl3_work.c
lapacke_strtri.c
lapacke_strtri_work.c
lapacke_strtrs.c
@@ -2410,10 +2412,10 @@ set(ZSRC
lapacke_ilaver.c
)
if (BUILD_LAPACK_DEPRECATED)
set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
endif()

set(SRCX


+ 27
- 6
cmake/prebuild.cmake View File

@@ -1006,15 +1006,15 @@ endif ()
"#define HAVE_SVE\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
@@ -1249,6 +1249,25 @@ endif ()
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "P5600")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 1048576\n"
@@ -1409,9 +1428,11 @@ endif ()
# GetArch_2nd
foreach(float_char S;D;Q;C;Z;X)
if (NOT DEFINED ${float_char}GEMM_UNROLL_M)
message(STATUS "setting unrollm=2")
set(${float_char}GEMM_UNROLL_M 2)
endif()
if (NOT DEFINED ${float_char}GEMM_UNROLL_N)
message(STATUS "setting unrolln=2")
set(${float_char}GEMM_UNROLL_N 2)
endif()
endforeach()


+ 39
- 18
cmake/system.cmake View File

@@ -21,7 +21,15 @@ endif()
# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet?
# It seems we are meant to use TARGET as input and CORE internally as kernel.
if(NOT DEFINED CORE AND DEFINED TARGET)
set(CORE ${TARGET})
# Translate the deprecated LoongArch target names to the core names used
# internally; every other TARGET value is taken over as CORE unchanged.
if (${TARGET} STREQUAL "LOONGSON3R5")
set(CORE "LA464")
elseif (${TARGET} STREQUAL "LOONGSON2K1000")
set(CORE "LA264")
elseif (${TARGET} STREQUAL "LOONGSONGENERIC")
# The core name must not carry a trailing ")" inside the quotes, otherwise
# later STREQUAL comparisons against LA64_GENERIC can never match.
set(CORE "LA64_GENERIC")
else ()
set(CORE ${TARGET})
endif()
endif()

# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
@@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
endif()
endif()
# Kernel sources for the ARMV9SME target are compiled for Armv9-A with SME at -O3.
if (${TARGET} STREQUAL ARMV9SME)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
endif()
if (${TARGET} STREQUAL A64FX)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
@@ -361,6 +372,20 @@ else ()
endif ()
endif ()

# OpenMP setup: the C component is mandatory and defines USE_OPENMP for the
# C sources; when Fortran is in use its OpenMP component is mandatory as well.
if (USE_OPENMP)
find_package(OpenMP COMPONENTS C REQUIRED)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP")
if (NOT NOFORTRAN)
find_package(OpenMP COMPONENTS Fortran REQUIRED)
# Avoid mixed OpenMP linkage: compare the link libraries of the imported
# C and Fortran OpenMP targets and abort configuration when they differ.
get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES)
get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES)
if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB)
message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.")
endif()
endif ()
endif ()

if (BINARY64)
if (INTERFACE64)
# CCOMMON_OPT += -DUSE64BITINT
@@ -620,6 +645,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif()
# TODO: not sure what PFLAGS is -hpa
set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}")
# Release-only adjustments to the Fortran flags.
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")

# Flang reporting a compiler version of 3 or lower: disable loop unrolling.
if ("${F_COMPILER}" STREQUAL "FLANG")
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3)
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops")
endif ()
endif ()
# LLVM Flang on Windows/ARM64: append -O2 to the release flags.
if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2")
endif ()
endif ()


set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
# TODO: not sure what FPFLAGS is -hpa
@@ -632,20 +669,11 @@ if (LAPACK_STRLEN)
endif()
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")

#Disable -fopenmp for LAPACK Fortran codes on Windows.
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel")
foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach ()
endif ()

if (CMAKE_Fortran_COMPILER)
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
message(STATUS "removing fortran flags")
message(STATUS "removing fortran flags not supported by the compiler")
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
endif ()
foreach (FILTER_FLAG ${FILTER_FLAGS})
@@ -676,13 +704,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE")
endif ()

if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
if ("${F_COMPILER}" STREQUAL "FLANG")
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3)
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops")
endif ()
endif ()
endif ()

if (NOT DEFINED SUFFIX)
set(SUFFIX o)


+ 11
- 0
cmake/system_check.cmake View File

@@ -139,6 +139,17 @@ endif()
endif()
endif()

# Probe whether the C compiler driver can assemble SME instructions (ARM64
# only, skipped when NO_SME is already set). The compiler's exit status is
# stored in NO_SME; a status of 1 adds -DNO_SME to the C flags.
if (ARM64)
if (NOT NO_SME)
# The probe is pure assembly, so it is written to a .S file: with a .c suffix
# the driver would parse it as C and reject it on every toolchain, making the
# probe report "no SME" unconditionally.
file(WRITE ${PROJECT_BINARY_DIR}/sme.S ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.S OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME)
# NOTE(review): only an exit status of exactly 1 defines NO_SME; other
# non-zero results leave the macro undefined - confirm that is intended.
if (NO_SME EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME")
endif()
file(REMOVE "${PROJECT_BINARY_DIR}/sme.S" "${PROJECT_BINARY_DIR}/sme.o")
endif()
endif()

include(CheckIncludeFile)
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
if (HAVE_C11 EQUAL 1)


+ 8
- 0
cmake/utils.cmake View File

@@ -16,6 +16,14 @@ endfunction ()
macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
set (C_COMPILER ${CMAKE_C_COMPILER_ID})
# Derive OSNAME from CMake's system name and normalise the values set here:
# any compiler id containing "Clang" becomes CLANG and "Windows" becomes WINNT.
set (OSNAME ${CMAKE_SYSTEM_NAME})
if (${C_COMPILER} MATCHES Clang)
set (C_COMPILER CLANG)
endif ()
if (${OSNAME} STREQUAL Windows)
set (OSNAME WINNT)
endif ()
# Quote the text: message() joins separate unquoted arguments without any
# separator, which would print e.g. "OSLinuxCOMPILERGNU".
message(STATUS "OS ${OSNAME} COMPILER ${C_COMPILER}")
set (IfElse 0)
set (ElseSeen 0)
set (SkipIfs 0)


+ 1
- 0
common.h View File

@@ -702,6 +702,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
int support_avx512(void);
int support_sme1(void);

#ifdef USE_OPENMP



+ 9
- 1
common_arm.h View File

@@ -114,7 +114,15 @@ static inline int blas_quickdivide(blasint x, blasint y){
OPENBLAS_ARM_TYPE_FUNCTION \
REALNAME:

#define EPILOGUE
/*
 * GNUSTACK expands to an empty .note.GNU-stack section directive on
 * ELF/Linux builds and to nothing elsewhere.
 */
#if defined(__ELF__) && defined(__linux__)
# define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
# define GNUSTACK
#endif

/* EPILOGUE closes an assembly routine by emitting the GNUSTACK note (if any). */
#define EPILOGUE \
GNUSTACK


#define PROFCODE



+ 1
- 1
common_arm64.h View File

@@ -175,7 +175,7 @@ REALNAME:
#define HUGE_PAGESIZE ( 4 << 20)

#ifndef BUFFERSIZE
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME)
#define BUFFER_SIZE (32 << 22)
#else
#define BUFFER_SIZE (32 << 20)


+ 2
- 0
common_d.h View File

@@ -22,6 +22,7 @@
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
#define DROTM_K drotm_k

#define DGEMV_N dgemv_n
#define DGEMV_T dgemv_t
@@ -180,6 +181,7 @@
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k
#define DROTM_K gotoblas -> drotm_k

#define DGEMV_N gotoblas -> dgemv_n
#define DGEMV_T gotoblas -> dgemv_t


+ 3
- 3
common_level1.h View File

@@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *);
int drotmg_k(double *, double *, double *, double *, double *);
int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *);

int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float);
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double);
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble);
int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);


int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);


+ 3
- 0
common_macro.h View File

@@ -70,6 +70,7 @@
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
#define ROTM_K QROTM_K

#define GEMV_N QGEMV_N
#define GEMV_T QGEMV_T
@@ -361,6 +362,7 @@
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
#define ROTM_K DROTM_K

#define GEMV_N DGEMV_N
#define GEMV_T DGEMV_T
@@ -977,6 +979,7 @@
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
#define ROTM_K SROTM_K

#define GEMV_N SGEMV_N
#define GEMV_T SGEMV_T


+ 8
- 0
common_param.h View File

@@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);

int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);

int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -197,6 +198,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);

int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -221,6 +223,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
#endif
/* ARM64 builds carry a sgemm_direct entry point as well. */
#ifdef ARCH_ARM64
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
#endif

int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -330,6 +336,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
#endif
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
@@ -439,6 +446,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);

int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);


+ 2
- 0
common_q.h View File

@@ -22,6 +22,7 @@
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
#define QROTM_K qrotm_k

#define QGEMV_N qgemv_n
#define QGEMV_T qgemv_t
@@ -165,6 +166,7 @@
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k
#define QROTM_K gotoblas -> qrotm_k

#define QGEMV_N gotoblas -> qgemv_n
#define QGEMV_T gotoblas -> qgemv_t


+ 4
- 2
common_s.h View File

@@ -24,6 +24,7 @@
#define SSCAL_K sscal_k
#define SSWAP_K sswap_k
#define SROT_K srot_k
#define SROTM_K srotm_k

#define SGEMV_N sgemv_n
#define SGEMV_T sgemv_t
@@ -189,6 +190,7 @@
#define SSCAL_K gotoblas -> sscal_k
#define SSWAP_K gotoblas -> sswap_k
#define SROT_K gotoblas -> srot_k
#define SROTM_K gotoblas -> srotm_k

#define SGEMV_N gotoblas -> sgemv_n
#define SGEMV_T gotoblas -> sgemv_t
@@ -213,9 +215,9 @@
#ifdef ARCH_X86_64
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#else
#elif ARCH_ARM64
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
#define SGEMM_DIRECT sgemm_direct
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#endif

#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy


+ 9
- 2
common_zarch.h View File

@@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
.global REALNAME ;\
.type REALNAME, %function ;\
REALNAME:

#define EPILOGUE
#if defined(__ELF__) && defined(__linux__)
# define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
# define GNUSTACK
#endif

#define EPILOGUE \
.size REALNAME, .-REALNAME; \
GNUSTACK

#define PROFCODE



+ 3
- 0
cpuid.S View File

@@ -65,3 +65,6 @@ _cpuid:
.subsections_via_symbols

#endif
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif

+ 40
- 4
cpuid_arm64.c View File

@@ -43,6 +43,9 @@ size_t length64=sizeof(value64);
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#if (defined OS_WINDOWS)
#include <winreg.h>
#endif

#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@@ -273,11 +276,11 @@ int detect(void)
fclose(infile);
}
}
sprintf(cpuimpl,"0x%2x",implementer);
sprintf(cpuimpl,"0x%02x",implementer);
cpu_implementer=strdup(cpuimpl);
}
qsort(cpucores,1024,sizeof(int),cpusort);
sprintf(cpupart,"0x%3x",cpucores[0]);
sprintf(cpupart,"0x%03x",cpucores[0]);
cpu_part=strdup(cpupart);
if(cpu_part != NULL && cpu_implementer != NULL) {
// Arm
@@ -371,20 +374,47 @@ int detect(void)
}
#else
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
cpulowperf=value64;
length64 = sizeof(value64);
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
if (value64 > 1) {
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
length64 = sizeof(value64);
sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0);
cpuhiperf=value64;
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
length64 = sizeof(value64);
sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0);
cpulowperf=value64;
}
length64 = sizeof(value64);
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
if (value64 == 1867590060) return CPU_VORTEX; //M4
#else
#ifdef OS_WINDOWS
/*
 * Identify the CPU from the "ProcessorNameString" registry value of
 * processor 0 and map known marketing names to a core type.
 *
 * The ANSI (...A) registry entry points are called explicitly so that the
 * value is always read as a byte string that strstr() can search, whether
 * or not UNICODE is defined for this build.
 */
{
    HKEY reghandle;
    char valstring[512] = "";
    /* Keep the last byte as a terminator: string data read from the
     * registry is not guaranteed to be NUL-terminated. */
    DWORD size = sizeof(valstring) - 1;
    LONG errcode = RegOpenKeyExA(HKEY_LOCAL_MACHINE, "Hardware\\Description\\System\\CentralProcessor\\0", 0, KEY_READ, &reghandle);

    if (errcode != ERROR_SUCCESS) {
        /* stdout carries the generated #define lines (written with printf),
         * so diagnostics are sent to stderr as ordinary byte output. */
        fprintf(stderr, "Could not open registry key for proc0: %lx\n", (unsigned long)errcode);
    } else {
        errcode = RegQueryValueExA(reghandle, "ProcessorNameString", NULL, NULL, (LPBYTE)valstring, &size);
        if (errcode != ERROR_SUCCESS) {
            fprintf(stderr, "Error reading cpuname from registry:%lx\n", (unsigned long)errcode);
            /* Do not match against a partially filled buffer. */
            valstring[0] = '\0';
        }
        /* Only close a handle that was actually opened. */
        RegCloseKey(reghandle);
    }

    if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1;
    if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1;
    if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1;
    if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1;
}
#endif
#endif
return CPU_ARMV8;
#endif
@@ -442,6 +472,7 @@ int n=0;
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
#endif
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value);
if (cpulowperf >0)
@@ -673,12 +704,17 @@ void get_cpuconfig(void)
case CPU_VORTEX:
printf("#define VORTEX \n");
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
printf("#define L1_DATA_LINESIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif


+ 17
- 0
cpuid_x86.c View File

@@ -1578,6 +1578,7 @@ int get_cpuname(void){
case 12: //family 6 exmodel 12
switch (model) {
case 15:
case 6: // Arrow Lake
if(support_avx512())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx2())
@@ -2421,6 +2422,22 @@ int get_coretype(void){
else
return CORE_NEHALEM;
}
case 12:
switch (model) {
case 6: // Arrow Lake
if(support_amx_bf16())
return CORE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CORE_COOPERLAKE;
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
}
case 15:
if (model <= 0x2) return CORE_NORTHWOOD;


+ 1
- 17
ctest/CMakeLists.txt View File

@@ -6,7 +6,7 @@ enable_language(Fortran)
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1)
list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os)
set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
endif()
@@ -44,10 +44,6 @@ else()
c_${float_char}blas1.c)
endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat1 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m)
endif()
@@ -73,10 +69,6 @@ else()
constant.c)
endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat2 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m)
endif()
@@ -124,20 +116,12 @@ else()
endif()
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m)
endif()
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3_3m m)
endif()


+ 4
- 4
ctest/Makefile View File

@@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
EXTRALIB += -lomp
endif
endif
ifeq ($(F_COMPILER), NAG)
CEXTRALIB = -lgomp
EXTRALIB = -lgomp
endif
ifeq ($(F_COMPILER), IBM)
ifeq ($(C_COMPILER), GCC)
CEXTRALIB += -lgomp
EXTRALIB += -lgomp
endif
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
EXTRALIB += -lomp
endif
endif
endif


+ 1
- 1
ctest/c_cblat1c.c View File

@@ -440,7 +440,7 @@ static real c_b43 = (float)1.;
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
static complex mwpcs[5], mwpct[5];
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*);
static complex cx[8];
extern real scnrm2test_(integer*, complex*, integer*);
static integer np1;


+ 3
- 3
docs/install.md View File

@@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a

4. Navigate to the OpenBLAS source code directory and start building OpenBLAS
by invoking Ninja:
```cmd
cd OpenBLAS
mkdir build
cd build
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new

ninja -j16
```


+ 4
- 0
driver/level2/CMakeLists.txt View File

@@ -223,3 +223,7 @@ if (USE_THREAD)
endif ()

add_library(driver_level2 OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(driver_level2 OpenMP::OpenMP_C)
endif()

+ 4
- 0
driver/level3/CMakeLists.txt View File

@@ -171,3 +171,7 @@ endforeach ()
#

add_library(driver_level3 OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(driver_level3 OpenMP::OpenMP_C)
endif()

+ 34
- 16
driver/level3/level3_thread.c View File

@@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG

#ifdef USE_OPENMP
static omp_lock_t level3_lock, critical_section_lock;
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0,
static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0,
parallel_section_left = MAX_PARALLEL_NUMBER;

// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c
@@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG

BLASLONG nthreads = args -> nthreads;

BLASLONG width, i, j, k, js;
BLASLONG width, width_n, i, j, k, js;
BLASLONG m, n, n_from, n_to;
int mode;
#if defined(DYNAMIC_ARCH)
@@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
/* Partition (a step of) n into nthreads regions */
range_N[0] = js;
num_parts = 0;
while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < switch_ratio && width > 1) {
width = switch_ratio;
for(j = 0; j < nthreads_n; j++){
width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j);
n -= width_n;
for(i = 0; i < nthreads_m; i++){
width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i);
if (width < switch_ratio) {
width = switch_ratio;
}
width = round_up(width_n, width, GEMM_PREFERED_SIZE);

width_n -= width;
if (width_n < 0) {
width = width + width_n;
width_n = 0;
}
range_N[num_parts + 1] = range_N[num_parts] + width;

num_parts ++;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);

n -= width;
if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width;

num_parts ++;
}
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
range_N[j + 1] = range_N[num_parts];
@@ -844,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
/* The objective function is the sum of the partition sizes in m and n: */
/* (n / nthreads_n) + (m / nthreads_m) */
/* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */
while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) {
nthreads_m /= 2;
nthreads_n *= 2;
BLASLONG cost = 0, div = 0;
BLASLONG i;
for (i = 1; i <= sqrt(nthreads_m); i++) {
if (nthreads_m % i) continue;
BLASLONG j = nthreads_m / i;
BLASLONG cost_i = n * j + m * nthreads_n * i;
BLASLONG cost_j = n * i + m * nthreads_n * j;
if (cost == 0 ||
cost_i < cost) {cost = cost_i; div = i;}
if (cost_j < cost) {cost = cost_j; div = j;}
}
if (div > 1) {
nthreads_m /= div;
nthreads_n *= div;
}
}



+ 4
- 0
driver/others/CMakeLists.txt View File

@@ -88,3 +88,7 @@ endif ()
#endif

add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES})

if (USE_OPENMP)
target_link_libraries(driver_others OpenMP::OpenMP_C)
endif()

+ 4
- 2
driver/others/blas_server.c View File

@@ -146,8 +146,8 @@ typedef struct {
} thread_status_t;

#ifdef HAVE_C11
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE)
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE)
#else
#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
@@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){

#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
LOCK_COMMAND(&server_lock);
if (unlikely(blas_server_avail == 0)) blas_thread_init();
UNLOCK_COMMAND(&server_lock);
#endif
BLASLONG i = 0;
blas_queue_t *current = queue;


+ 67
- 4
driver/others/dynamic_arm64.c View File

@@ -43,6 +43,14 @@
#include <sys/auxv.h>
#endif

#ifdef __APPLE__
#include <sys/sysctl.h>
/*
 * Scratch objects for sysctl queries on Apple platforms: a 32-bit and a
 * 64-bit result buffer, each paired with a size variable initialised to the
 * buffer's size. value64/length64 are passed to sysctlbyname() later in this
 * file.
 * NOTE(review): these have external linkage and very generic names; if no
 * other translation unit refers to them they could be made static - confirm
 * before changing.
 */
int32_t value;
size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif

extern gotoblas_t gotoblas_ARMV8;
#ifdef DYNAMIC_LIST
#ifdef DYN_CORTEXA53
@@ -115,7 +123,12 @@ extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
#ifdef DYN_ARMV9SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA55
extern gotoblas_t gotoblas_CORTEXA55;
#else
#define gotoblas_CORTEXA55 gotoblas_ARMV8
@@ -142,21 +155,28 @@ extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
extern gotoblas_t gotoblas_A64FX;
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#endif
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#define gotoblas_A64FX gotoblas_ARMV8
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
#endif
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2

extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

#define NUM_CORETYPES 18
#define NUM_CORETYPES 19

/*
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -168,6 +188,9 @@ extern void openblas_warning(int verbose, const char * msg);
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#ifndef HWCAP2_SME
/* Fallback definition for build systems whose headers do not provide it.
 * Parenthesized, like HWCAP_SVE above, so the macro expands safely inside
 * larger expressions regardless of operator precedence. */
#define HWCAP2_SME (1 << 23)
#endif

#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@@ -192,6 +215,7 @@ static char *corename[] = {
"cortexa55",
"armv8sve",
"a64fx",
"armv9sme",
"unknown"
};

@@ -214,6 +238,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
if (gotoblas == &gotoblas_A64FX) return corename[17];
if (gotoblas == &gotoblas_ARMV9SME) return corename[18];
return corename[NUM_CORETYPES];
}

@@ -251,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 15: return (&gotoblas_CORTEXA55);
case 16: return (&gotoblas_ARMV8SVE);
case 17: return (&gotoblas_A64FX);
case 18: return (&gotoblas_ARMV9SME);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -262,6 +288,11 @@ static gotoblas_t *get_coretype(void) {
char coremsg[128];

#if defined (OS_DARWIN)
//future #if !defined(NO_SME)
// if (support_sme1()) {
// return &gotoblas_ARMV9SME;
// }
// #endif
return &gotoblas_NEOVERSEN1;
#endif
@@ -409,13 +440,21 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_TSV110;
}
break;
case 0x50: // Ampere
case 0x50: // Ampere/AppliedMicro
switch (part)
{
case 0x000: // Skylark/EMAG8180
return &gotoblas_EMAG8180;
}
break;
case 0xc0: // Ampere
switch(part)
{
case 0xac3:
case 0xac4:
return &gotoblas_NEOVERSEN1;
}
break;
case 0x51: // Qualcomm
switch (part)
{
@@ -424,12 +463,20 @@ static gotoblas_t *get_coretype(void) {
}
break;
case 0x61: // Apple
//future if (support_sme1()) return &gotoblas_ARMV9SME;
return &gotoblas_NEOVERSEN1;
break;
default:
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg);
}

#if !defined(NO_SME)
if (support_sme1()) {
return &gotoblas_ARMV9SME;
}
#endif

#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
@@ -480,3 +527,19 @@ void gotoblas_dynamic_init(void) {
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

/*
 * Report whether the running system advertises SME support.
 *
 * On Linux/Android the HWCAP2_SME bit of getauxval(AT_HWCAP2) is tested;
 * on Apple platforms the "hw.optional.arm.FEAT_SME" sysctl is queried.
 *
 * Returns 1 if SME is reported as available, 0 otherwise (including when
 * neither detection path is compiled in or the query fails).
 */
int support_sme1(void) {
    int ret = 0;

#if (defined OS_LINUX || defined OS_ANDROID)
    /* The flag is parenthesized at the point of use so the test is correct
     * even if HWCAP2_SME expands to an unparenthesized expression. */
    if (getauxval(AT_HWCAP2) & (HWCAP2_SME)) {
        ret = 1;
    }
#endif
#if defined(__APPLE__)
    /* sysctlbyname() takes the buffer size as an in/out argument, so it is
     * reset before the call; the buffer is cleared so that a short or
     * failed read cannot leave stale data behind. */
    value64 = 0;
    length64 = sizeof(value64);
    if (sysctlbyname("hw.optional.arm.FEAT_SME", &value64, &length64, NULL, 0) == 0
        && value64 != 0) {
        ret = 1;
    }
#endif
    return ret;
}

+ 1
- 1
exports/Makefile View File

@@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL)
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else ifeq ($(F_COMPILER), FLANG)
else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW))
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)


+ 154
- 29
exports/gensymbol.pl View File

@@ -21,7 +21,7 @@
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr);
@blasobjsd = (
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
@@ -29,7 +29,7 @@
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr);
@blasobjss = (
isamax,isamin,ismax,ismin,
@@ -38,7 +38,7 @@
smax,smin,snrm2,simatcopy,somatcopy,
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr);
@blasobjsz = (
izamax,izamin,,
@@ -48,28 +48,29 @@
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
zgeadd, dzsum, zgemmt);
zgeadd, dzsum, zgemmt,zgemmtr);

@blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby,
cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd,
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
cblas_scnrm2, cblas_scasum,
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
cblas_cgemmt);
cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr,
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy,
cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch);

@cblasobjsd = (
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot,
cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2,
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
cblas_dgemmt);
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr,
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy,
cblas_damax, cblas_damin, cblas_dgemm_batch);
@cblasobjss = (
cblas_sasum, cblas_saxpy, cblas_saxpby,
@@ -78,9 +79,10 @@
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk,
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
cblas_strsv, cblas_sgeadd,
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
cblas_sgemmt);
cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr,
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy,
cblas_samax, cblas_samin, cblas_sgemm_batch);

@cblasobjsz = (
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
@@ -88,13 +90,13 @@
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
cblas_zaxpby, cblas_zgeadd,
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
cblas_zgemmt);
cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr,
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy,
cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch);

@cblasobjs = ( cblas_xerbla );

@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch);

@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -709,6 +711,7 @@ zpotri,
# functions added for lapack-3.7.0
@lapackobjs2s = (@lapackobjs2s,
slarfy,
ssyconvf,
strevc3,
sgelqt,
sgelqt3,
@@ -832,12 +835,82 @@ zpotri,
zungtsqr_row
);

#functions added for lapack-3.11
@lapackobjs2c = (@lapackobjs2c,
cgedmd,
cgedmdq
);
@lapackobjs2d = (@lapackobjs2d,
dgedmd,
dgedmdq
);
@lapackobjs2s = (@lapackobjs2s,
sgedmd,
sgedmdq
);
@lapackobjs2z = (@lapackobjs2z,
zgedmd,
zgedmdq
);

#functions added post 3.11

@lapackobjs2c = (@lapackobjs2c,
cgelst,
cgeqp3rk,
claqp2rk,
claqp3rk,
clatrs3,
crscl,
ctrsyl3
);
# claqz0
# claqz1
# claqz2
# claqz3
# clatrs3

@lapackobjs2d = (@lapackobjs2d,
dgelst,
dgeqp3rk,
dlaqp2rk,
dlaqp3rk,
dlarmm,
dlatrs3,
dtrsyl3
);

@lapackobjs2s = (@lapackobjs2s,
sgelst,
sgeqp3rk,
slaqp2rk,
slaqp3rk,
slarmm,
slatrs3,
strsyl3
);

@lapackobjs2z = (@lapackobjs2z,
zgelst,
zgeqp3rk,
zlaqp2rk,
zlaqp3rk,
zlatrs3,
zrscl,
ztrsyl3
);
# zlaqz0
# zlaqz1
# zlaqz2
# zlaqz3

@lapack_extendedprecision_objs = (
zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx,
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
);

@lapack_deprecated_objsc = (
cgelqs, cgeqrs,
cgegs, cggsvd,
cgegv, cggsvp,
cgelsx, clahrd,
@@ -845,13 +918,16 @@ zpotri,
ctzrqf,
);
@lapack_deprecated_objsd = (
dgelqs, dgeqrs,
dgegs, dgeqpf,
dgegv, dggsvd,
dgelsx, dggsvp,
dlahrd,
dlatzm, dtzrqf);
@lapack_deprecated_objss = (
@lapack_deprecated_objss = (
sgelqs,
sgeqrs,
sgelsx,
sgegs,
sgegv,
@@ -864,6 +940,8 @@ zpotri,
);

@lapack_deprecated_objsz = (
zgelqs,
zgeqrs,
zgegs,
zgegv,
zgelsx,
@@ -997,6 +1075,10 @@ zpotri,
LAPACKE_cgebrd_work,
LAPACKE_cgecon,
LAPACKE_cgecon_work,
LAPACKE_cgedmd,
LAPACKE_cgedmd_work,
LAPACKE_cgedmdq,
LAPACKE_cgedmdq_work,
LAPACKE_cgeequ,
LAPACKE_cgeequ_work,
LAPACKE_cgeequb,
@@ -1584,8 +1666,15 @@ zpotri,
LAPACKE_cgetsqrhrt,
LAPACKE_cgetsqrhrt_work,
LAPACKE_cungtsqr_row,
LAPACKE_cungtsqr_row_work

LAPACKE_cungtsqr_row_work,
LAPACKE_clangb,
LAPACKE_clangb_work,
LAPACKE_ctrsyl3,
LAPACKE_ctrsyl3_work,
LAPACKE_ctz_nancheck,
LAPACKE_ctz_trans,
LAPACKE_cunhr_col,
LAPACKE_cunhr_col_work
);
@lapackeobjsd = (
LAPACKE_dgb_nancheck,
@@ -1656,6 +1745,10 @@ zpotri,
LAPACKE_dgebrd_work,
LAPACKE_dgecon,
LAPACKE_dgecon_work,
LAPACKE_dgedmd,
LAPACKE_dgedmd_work,
LAPACKE_dgedmdq,
LAPACKE_dgedmdq_work,
LAPACKE_dgeequ,
LAPACKE_dgeequ_work,
LAPACKE_dgeequb,
@@ -2197,7 +2290,15 @@ zpotri,
LAPACKE_dgetsqrhrt,
LAPACKE_dgetsqrhrt_work,
LAPACKE_dorgtsqr_row,
LAPACKE_dorgtsqr_row_work
LAPACKE_dorgtsqr_row_work,
LAPACKE_dlangb,
LAPACKE_dlangb_work,
LAPACKE_dorhr_col,
LAPACKE_dorhr_col_work,
LAPACKE_dtrsyl3,
LAPACKE_dtrsyl3_work,
LAPACKE_dtz_nancheck,
LAPACKE_dtz_trans,

);
@lapackeobjss = (
@@ -2269,6 +2370,10 @@ zpotri,
LAPACKE_sgebrd_work,
LAPACKE_sgecon,
LAPACKE_sgecon_work,
LAPACKE_sgedmd,
LAPACKE_sgedmd_work,
LAPACKE_sgedmdq,
LAPACKE_sgedmdq_work,
LAPACKE_sgeequ,
LAPACKE_sgeequ_work,
LAPACKE_sgeequb,
@@ -2802,7 +2907,15 @@ zpotri,
LAPACKE_sgetsqrhrt,
LAPACKE_sgetsqrhrt_work,
LAPACKE_sorgtsqr_row,
LAPACKE_sorgtsqr_row_work
LAPACKE_sorgtsqr_row_work,
LAPACKE_slangb,
LAPACKE_slangb_work,
LAPACKE_sorhr_col,
LAPACKE_sorhr_col_work,
LAPACKE_strsyl3,
LAPACKE_strsyl3_work,
LAPACKE_stz_nancheck,
LAPACKE_stz_trans,

);
@lapackeobjsz = (
@@ -2878,6 +2991,10 @@ zpotri,
LAPACKE_zgebrd_work,
LAPACKE_zgecon,
LAPACKE_zgecon_work,
LAPACKE_zgedmd,
LAPACKE_zgedmd_work,
LAPACKE_zgedmdq,
LAPACKE_zgedmdq_work,
LAPACKE_zgeequ,
LAPACKE_zgeequ_work,
LAPACKE_zgeequb,
@@ -3345,7 +3462,15 @@ zpotri,
LAPACKE_zgetsqrhrt,
LAPACKE_zgetsqrhrt_work,
LAPACKE_zungtsqr_row,
LAPACKE_zungtsqr_row_work
LAPACKE_zungtsqr_row_work,
LAPACKE_zlangb,
LAPACKE_zlangb_work,
LAPACKE_zunhr_col,
LAPACKE_zunhr_col_work,
LAPACKE_ztrsyl3,
LAPACKE_ztrsyl3_work,
LAPACKE_ztz_nancheck,
LAPACKE_ztz_trans,

## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@@ -3551,7 +3676,7 @@ zpotri,
LAPACKE_zsytrs_aa_2stage_work,
# new functions from 3.9.0
LAPACKE_zgesvdq,
LAPACKE_zgesvdq_work
LAPACKE_zgesvdq_work,
);

#These function may need 2 underscores.
@@ -3573,7 +3698,7 @@ zpotri,
ssygv_2stage,
ssysv_aa_2stage, ssytrf_aa_2stage,
ssytrs_aa_2stage,
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett
);
@lapack_embeded_underscore_objs_c=(
chetf2_rook, chetrf_rook, chetri_rook,
@@ -3598,7 +3723,7 @@ zpotri,
chetrf_aa_2stage, chetrs_aa_2stage,
csysv_aa_2stage, csytrf_aa_2stage,
csytrs_aa_2stage,
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett
);
@lapack_embeded_underscore_objs_d=(
dlasyf_rook,
@@ -3615,7 +3740,7 @@ zpotri,
dsbevd_2stage, dsygv_2stage,
dsysv_aa_2stage,
dsytrf_aa_2stage, dsytrs_aa_2stage,
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett
);
@lapack_embeded_underscore_objs_z=(
zhetf2_rook, zhetrf_rook, zhetri_rook,
@@ -3639,7 +3764,7 @@ zpotri,
zhesv_aa_2stage, zhetrf_aa_2stage,
zhetrs_aa_2stage, zsysv_aa_2stage,
zsytrf_aa_2stage, zsytrs_aa_2stage,
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett
);




+ 7
- 0
f_check View File

@@ -245,6 +245,13 @@ else
;;
*flang*)
vendor=FLANG
data=`$compiler -v 2>&1 > /dev/null`
v="${data#*version *}"
v="${v%%*.}"
major="${v%%.*}"
if [ "$major" -ge 17 ]; then
vendor=FLANGNEW
fi
bu=_
openmp='-fopenmp'
;;


+ 13
- 0
getarch.c View File

@@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "ARMV8SVE"
#endif

/*
 * Settings used when FORCE_ARMV9SME is defined: marks the target as forced
 * (FORCE) and names it as the ARMV9SME sub-architecture of ARM64.
 * ARCHCONFIG carries, as -D options, the L1/L2 cache and DTB parameters plus
 * the HAVE_* feature flags (VFP, NEON, SVE, SME) and the ARMV8/ARMV9 markers.
 */
#ifdef FORCE_ARMV9SME
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV9SME"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV9SME " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9"
#define LIBNAME "armv9sme"
#define CORENAME "ARMV9SME"
#endif

#ifdef FORCE_ARMV8
#define FORCE


+ 11
- 7
interface/CMakeLists.txt View File

@@ -30,17 +30,17 @@ set(BLAS2_SOURCES
gemv.c ger.c
trsv.c trmv.c
syr2.c gbmv.c
sbmv.c
sbmv.c spmv.c
spr2.c
tbsv.c tbmv.c
tpsv.c tpmv.c
)

set(BLAS2_REAL_ONLY_SOURCES
symv.c syr.c spmv.c spr.c
symv.c syr.c spr.c
)
set(BLAS2_COMPLEX_LAPACK_SOURCES
symv.c syr.c spmv.c spr.c
symv.c syr.c spr.c
)

set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES
@@ -109,7 +109,7 @@ endif ()
GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG})
# gemmtr is gemmt under the name adopted by the Reference BLAS
GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG})
GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG})

# max and imax are compiled 4 times
GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG})
@@ -125,8 +125,8 @@ endif ()
if (BUILD_BFLOAT16)
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
@@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS)
endforeach ()
endif()

if (NOT DEFINED NO_LAPACK)
if (NOT NO_LAPACK)
set(LAPACK_SOURCES
lapack/gesv.c
)
@@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
endif ()

add_library(interface OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(interface OpenMP::OpenMP_C)
endif()

+ 12
- 12
interface/Makefile View File

@@ -1304,9 +1304,9 @@ ifeq ($(BUILD_BFLOAT16),1)
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)
sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)
endif

sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
@@ -1328,34 +1328,34 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)

sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)

dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)

qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)

cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)

zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F)

sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)

dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)

qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)

cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)

zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)

ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c
$(CC) -c $(CFLAGS) $< -o $(@F)


+ 86
- 14
interface/gemm.c View File

@@ -1,5 +1,5 @@
/*********************************************************************/
/* Copyright 2024 The OpenBLAS Project */
/* Copyright 2024, 2025 The OpenBLAS Project */
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
@@ -177,6 +177,74 @@ static int init_amxtile_permission() {
}
#endif

#ifdef SMP
#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-count heuristic for GEMM on Neoverse V1.
 *
 * MNK  - problem-size estimate; the dispatcher in this file passes the
 *        product m*n*k as a double.
 * ncpu - upper bound on the number of threads that may be returned.
 *
 * Returns 1 when MNK is below the first limit, MIN(ncpu, cap) for each
 * intermediate size class, and ncpu once MNK reaches the last limit.
 * The limits and caps are fixed constants (presumably tuned empirically).
 */
static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) {
  /* Size classes in ascending order: MNK below `limit` => at most `cap` threads. */
  static const struct {
    double limit;
    int    cap;
  } size_class[] = {
    {   1124864.0,  6 },
    {   7880599.0, 12 },
    {  17173512.0, 16 },
    {  33386248.0, 20 },
    {  57066625.0, 24 },
    {  91733851.0, 32 },
    { 265847707.0, 40 },
    { 458314011.0, 48 },
    { 729000000.0, 56 },
  };
  const unsigned int nclasses = sizeof(size_class) / sizeof(size_class[0]);
  unsigned int idx;

  /* Smallest problems always run single-threaded, independent of ncpu. */
  if (MNK < 262144.0)
    return 1;

  for (idx = 0; idx < nclasses; idx++) {
    if (MNK < size_class[idx].limit)
      return MIN(ncpu, size_class[idx].cap);
  }

  /* At or beyond the last limit: use everything that is available. */
  return ncpu;
}
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2)
/*
 * Thread-count heuristic for GEMM on Neoverse V2.
 *
 * MNK  - problem-size estimate; the dispatcher in this file passes the
 *        product m*n*k as a double.
 * ncpu - upper bound on the number of threads that may be returned.
 *
 * Returns 1 when MNK is below the first limit, MIN(ncpu, cap) for each
 * intermediate size class, and ncpu once MNK reaches the last limit.
 * The limits and caps are fixed constants (presumably tuned empirically).
 */
static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) {
  /* Size classes in ascending order: MNK below `limit` => at most `cap` threads. */
  static const struct {
    double limit;
    int    cap;
  } size_class[] = {
    {    1092727.0,  6 },
    {    2628072.0,  8 },
    {    8000000.0, 12 },
    {   20346417.0, 16 },
    {   57066625.0, 24 },
    {   91125000.0, 28 },
    {  238328000.0, 40 },
    {  454756609.0, 48 },
    {  857375000.0, 56 },
    { 1073741824.0, 64 },
  };
  const unsigned int nclasses = sizeof(size_class) / sizeof(size_class[0]);
  unsigned int idx;

  /* Smallest problems always run single-threaded, independent of ncpu. */
  if (MNK < 125000.0)
    return 1;

  for (idx = 0; idx < nclasses; idx++) {
    if (MNK < size_class[idx].limit)
      return MIN(ncpu, size_class[idx].cap);
  }

  /* At or beyond the last limit: use everything that is available. */
  return ncpu;
}
#endif

/*
 * Choose the number of threads for a GEMM call.
 *
 * MNK - problem-size estimate (the caller in this file passes m*n*k).
 *
 * For real single-precision builds (no COMPLEX, DOUBLE or BFLOAT16) the
 * decision is delegated to a core-specific table when the library is built
 * for NEOVERSEV1 / NEOVERSEV2, or, under DYNAMIC_ARCH, when
 * gotoblas_corename() reports one of those cores.  Every other case falls
 * through to the generic rule at the bottom, which is driven by
 * SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD.
 */
static inline int get_gemm_optimal_nthreads(double MNK) {
  int ncpu = num_cpu_avail(3);

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  /* Runtime dispatch: match on the name of the core selected at load time. */
  if (!strcmp(gotoblas_corename(), "neoversev1"))
    return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
  if (!strcmp(gotoblas_corename(), "neoversev2"))
    return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu);
#endif

  /* Generic rule. */

  /* Not enough work to be worth more than one thread. */
  if (MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD))
    return 1;

  /* Per-thread share would drop below the threshold: scale the thread
     count down so each thread gets about one threshold's worth of work
     (the quotient is truncated by the conversion to int). */
  if (MNK / ncpu < SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD)
    return MNK / (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD);

  return ncpu;
}
#endif

#ifndef CBLAS

void NAME(char *TRANSA, char *TRANSB,
@@ -310,7 +378,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FLOAT *beta = (FLOAT*) vbeta;
FLOAT *a = (FLOAT*) va;
FLOAT *b = (FLOAT*) vb;
FLOAT *c = (FLOAT*) vc;
FLOAT *c = (FLOAT*) vc;
#endif

blas_arg_t args;
@@ -349,15 +417,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#ifdef DYNAMIC_ARCH
if (support_avx512() )
#endif
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}

#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#endif

#ifndef COMPLEX
@@ -604,13 +682,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#endif

MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else {
args.nthreads = num_cpu_avail(3);
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
}
args.nthreads = get_gemm_optimal_nthreads(MNK);

args.common = NULL;



+ 36
- 0
interface/gemmt.c View File

@@ -38,6 +38,17 @@

#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef RNAME
#ifdef XDOUBLE
#define ERROR_NAME "QGEMMTR"
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMMTR"
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMMTR"
#else
#define ERROR_NAME "SGEMMTR"
#endif
#else
#ifdef XDOUBLE
#define ERROR_NAME "QGEMMT "
#elif defined(DOUBLE)
@@ -47,8 +58,18 @@
#else
#define ERROR_NAME "SGEMMT "
#endif
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifdef RNAME
#ifdef XDOUBLE
#define ERROR_NAME "XGEMMTR"
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMMTR"
#else
#define ERROR_NAME "CGEMMTR"
#endif
#else
#ifdef XDOUBLE
#define ERROR_NAME "XGEMMT "
#elif defined(DOUBLE)
@@ -57,6 +78,7 @@
#define ERROR_NAME "CGEMMT "
#endif
#endif
#endif

#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
@@ -666,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

IDEBUG_END;

/* transform B back if necessary */
#if defined(COMPLEX)
if (transb > 1){
#ifndef CBLAS
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#else
if (order == CblasColMajor)
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
if (order == CblasRowMajor)
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#endif
}
#endif

return;
}

+ 65
- 12
interface/gemv.c View File

@@ -63,6 +63,70 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT
};
#endif

#ifdef SMP
#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-count heuristic for GEMV on Neoverse V1.
 *
 * MN   - problem-size estimate (the caller in this file passes m*n).
 * ncpu - upper bound on the number of threads that may be returned.
 *
 * Two separate threshold tables are compiled, selected by DOUBLE.  In both,
 * the smallest class returns 1, intermediate classes return MIN(ncpu, cap),
 * and anything at or above the last limit returns ncpu.
 */
static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) {
#ifdef DOUBLE
  /* Table used when DOUBLE is defined. */
  if (MN < 8100L)    return 1;
  if (MN < 12100L)   return MIN(ncpu, 2);
  if (MN < 36100L)   return MIN(ncpu, 4);
  if (MN < 84100L)   return MIN(ncpu, 8);
  if (MN < 348100L)  return MIN(ncpu, 16);
  if (MN < 435600L)  return MIN(ncpu, 24);
  if (MN < 810000L)  return MIN(ncpu, 32);
  if (MN < 1050625L) return MIN(ncpu, 40);
  return ncpu;
#else
  /* Table used when DOUBLE is not defined. */
  if (MN < 25600L)   return 1;
  if (MN < 63001L)   return MIN(ncpu, 4);
  if (MN < 459684L)  return MIN(ncpu, 16);
  return ncpu;
#endif
}
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2)
/*
 * Thread-count heuristic for GEMV on Neoverse V2.
 *
 * MN   - problem-size estimate (the caller in this file passes m*n).
 * ncpu - upper bound on the number of threads that may be returned.
 *
 * Returns 1 for the smallest class, MIN(ncpu, cap) for the intermediate
 * classes, and ncpu once MN reaches the last limit.
 */
static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) {
  if (MN < 24964L)   return 1;
  if (MN < 65536L)   return MIN(ncpu, 8);
  if (MN < 262144L)  return MIN(ncpu, 32);
  if (MN < 1638400L) return MIN(ncpu, 64);
  return ncpu;
}
#endif

/*
 * Choose the number of threads for a GEMV call.
 *
 * MN - problem-size estimate (the caller in this file passes m*n).
 *
 * Order of precedence:
 *   1. Windows on ARM64 builds: a single fixed cut-off, nothing else applies.
 *   2. NEOVERSEV1 / NEOVERSEV2 builds, or DYNAMIC_ARCH builds running on one
 *      of those cores: the core-specific table.
 *   3. Otherwise: the generic cut-off at 115200 * GEMM_MULTITHREAD_THRESHOLD.
 *
 * NOTE(review): the static NEOVERSEV2 branch is disabled when DOUBLE is
 * defined, while the DYNAMIC_ARCH branch is not - confirm whether that
 * asymmetry is intended.
 */
static inline int get_gemv_optimal_nthreads(BLASLONG MN) {
  int ncpu = num_cpu_avail(3);

#if defined(_WIN64) && defined(_M_ARM64)
  /* Unconditional return: the code below is unreachable in this build. */
  return (MN > 100000000L) ? num_cpu_avail(4) : 1;
#endif

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16)
  return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
  return get_gemv_optimal_nthreads_neoversev2(MN, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16)
  /* Runtime dispatch: match on the name of the core selected at load time. */
  if (!strcmp(gotoblas_corename(), "neoversev1"))
    return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
  if (!strcmp(gotoblas_corename(), "neoversev2"))
    return get_gemv_optimal_nthreads_neoversev2(MN, ncpu);
#endif

  /* Generic rule; note it queries num_cpu_avail(2) rather than reusing ncpu. */
  return (MN < 115200L * GEMM_MULTITHREAD_THRESHOLD) ? 1 : num_cpu_avail(2);
}
#endif

#ifndef CBLAS

void NAME(char *TRANS, blasint *M, blasint *N,
@@ -202,13 +266,6 @@ void CNAME(enum CBLAS_ORDER order,

if (alpha == ZERO) return;
#if 0
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) {
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL);
return;
}
#endif
IDEBUG_START;

FUNCTION_PROFILE_START();
@@ -225,11 +282,7 @@ void CNAME(enum CBLAS_ORDER order,
STACK_ALLOC(buffer_size, FLOAT, buffer);

#ifdef SMP

if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = num_cpu_avail(2);
nthreads = get_gemv_optimal_nthreads(1L * m * n);

if (nthreads == 1) {
#endif


+ 21
- 7
interface/lapack/gesv.c View File

@@ -107,21 +107,35 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,

#ifndef PPC440
buffer = (FLOAT *)blas_memory_alloc(1);
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
args.common = NULL;
#ifndef DOUBLE
if (args.m*args.n < 40000)

#if defined(_WIN64) && defined(_M_ARM64)
#ifdef COMPLEX
if (args.m * args.n <= 300)
#else
if (args.m * args.n <= 500)
#endif
args.nthreads = 1;
else if (args.m * args.n <= 1000)
args.nthreads = 4;
else
args.nthreads = num_cpu_avail(4);
#else
if (args.m*args.n < 10000)
#ifndef DOUBLE
if (args.m * args.n < 40000)
#else
if (args.m * args.n < 10000)
#endif
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
#endif
args.nthreads=1;
else
args.nthreads = num_cpu_avail(4);

if (args.nthreads == 1) {
#endif


+ 63
- 1
interface/nrm2.c View File

@@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
return fabsf(x[0]);
#endif
#endif

if (incx == 0)
#ifndef COMPLEX
#ifdef DOUBLE
return (sqrt((double)n)*fabs(x[0]));
#else
return (sqrt((float)n)*fabsf(x[0]));
#endif
#else
#ifdef DOUBLE
{
double fr=fabs(x[0]);
double fi=fabs(x[1]);
double fmin=MIN(fr,fi);
double fmax=MAX(fr,fi);
if (fmax==0.) return(fmax);
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax);
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax)));
}
#else
{
float fr=fabs(x[0]);
float fi=fabs(x[1]);
float fmin=MIN(fr,fi);
float fmax=MAX(fr,fi);
if (fmax==0.) return(fmax);
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax);
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax)));
}
#endif
#endif

if (incx < 0)
@@ -97,13 +128,44 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){

if (n <= 0) return 0.;

#ifndef COMPLEX
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif

if (incx == 0)
#ifndef COMPLEX
#ifdef DOUBLE
return (sqrt((double)n)*fabs(x[0]));
#else
return (sqrt((float)n)*fabsf(x[0]));
#endif
#else
#ifdef DOUBLE
{
double fr=fabs(x[0]);
double fi=fabs(x[1]);
double fmin=MIN(fr,fi);
double fmax=MAX(fr,fi);
if (fmax==0.) return(fmax);
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax);
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax)));
}
#else
{
float fr=fabs(x[0]);
float fi=fabs(x[1]);
float fmin=MIN(fr,fi);
float fmax=MAX(fr,fi);
if (fmax==0.) return(fmax);
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax);
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax)));
}
#endif
#endif

if (incx < 0)


+ 6
- 134
interface/rotm.c View File

@@ -7,149 +7,21 @@

void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){

blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;
blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;

PRINT_DEBUG_NAME
#else

void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){

#endif

blasint i__1, i__2;
PRINT_DEBUG_CNAME;

blasint i__;
FLOAT w, z__;
blasint kx, ky;
FLOAT dh11, dh12, dh22, dh21, dflag;
blasint nsteps;

#ifndef CBLAS
PRINT_DEBUG_CNAME;
#else
PRINT_DEBUG_CNAME;
#endif

--dparam;
--dy;
--dx;

dflag = dparam[1];
if (n <= 0 || dflag == - 2.0) goto L140;

if (! (incx == incy && incx > 0)) goto L70;

nsteps = n * incx;
if (dflag < 0.) {
goto L50;
} else if (dflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
dh12 = dparam[4];
dh21 = dparam[3];
i__1 = nsteps;
i__2 = incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w + z__ * dh12;
dy[i__] = w * dh21 + z__;
/* L20: */
}
goto L140;
L30:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = nsteps;
i__1 = incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__;
dy[i__] = -w + dh22 * z__;
/* L40: */
}
goto L140;
L50:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__1 = nsteps;
i__2 = incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__ * dh12;
dy[i__] = w * dh21 + z__ * dh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (incx < 0) {
kx = (1 - n) * incx + 1;
}
if (incy < 0) {
ky = (1 - n) * incy + 1;
}
ROTM_K(n, dx, incx, dy, incy, dparam);

if (dflag < 0.) {
goto L120;
} else if (dflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
dh12 = dparam[4];
dh21 = dparam[3];
i__2 = n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w + z__ * dh12;
dy[ky] = w * dh21 + z__;
kx += incx;
ky += incy;
/* L90: */
}
goto L140;
L100:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__;
dy[ky] = -w + dh22 * z__;
kx += incx;
ky += incy;
/* L110: */
}
goto L140;
L120:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__2 = n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__ * dh12;
dy[ky] = w * dh21 + z__ * dh22;
kx += incx;
ky += incy;
/* L130: */
}
L140:
return;
}


+ 10
- 5
interface/zgemv.c View File

@@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order,

#ifdef SMP

if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )
#if defined(_WIN64) && defined(_M_ARM64)
if (m*n > 25000000L)
nthreads = num_cpu_avail(4);
else
nthreads = 1;
#else
if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 1;
else
nthreads = num_cpu_avail(2);
#endif

if (nthreads == 1) {
#endif
#endif

(gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer);

#ifdef SMP

} else {

(gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads);

}
#endif


STACK_FREE(buffer);

FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);


+ 2
- 2
interface/zscal.c View File

@@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
if (nthreads == 1) {
#endif

SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0);
SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1);

#ifdef SMP
} else {
@@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif

blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads);
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads);

}
#endif


+ 2
- 2
interface/zsyr.c View File

@@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,

#else

void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) {

FLOAT *buffer;
int uplo;
blasint info;
FLOAT * ALPHA = &alpha;
FLOAT * ALPHA = (FLOAT*)valpha;
FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1];
#ifdef SMP


+ 20
- 3
kernel/CMakeLists.txt View File

@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
@@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE")
endif ()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE")
@@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE")
@@ -198,25 +201,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3
set(USE_TRMM false)
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS))
if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS))
set(USE_TRMM true)
endif ()
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
set(USE_TRMM true)
endif ()

set(USE_DIRECT_SGEMM false)
if (X86_64)
if (X86_64 OR ARM64)
set(USE_DIRECT_SGEMM true)
endif()

if (USE_DIRECT_SGEMM)
# if (NOT DEFINED SGEMMDIRECTKERNEL)
if (X86_64)
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c)
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
# endif()
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
elseif (ARM64)
set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c)
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S)
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
if (HAVE_SME)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE)
endif ()
endif ()
endif()

foreach (float_type SINGLE DOUBLE)
@@ -1105,6 +1118,7 @@ endif ()
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE")
@@ -1352,6 +1366,9 @@ endif ()
if (USE_GEMM3M)
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
endif()
if (USE_OPENMP)
target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C)
endif()
endfunction ()




+ 4
- 0
kernel/Makefile View File

@@ -24,7 +24,11 @@ ifdef NO_AVX2
AVX2OPT=
endif


ifdef TARGET_CORE
ifeq ($(TARGET_CORE), ARMV9SME)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme
endif
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))


+ 25
- 4
kernel/Makefile.L1 View File

@@ -336,6 +336,18 @@ ifndef XROTKERNEL
XROTKERNEL = zrot.S
endif

ifndef SROTMKERNEL
SROTMKERNEL = rotm.S
endif

ifndef DROTMKERNEL
DROTMKERNEL = rotm.S
endif

ifndef QROTMKERNEL
QROTMKERNEL = rotm.S
endif

### SCAL ###

ifndef SSCALKERNEL
@@ -504,21 +516,21 @@ SBLASOBJS += \
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
saxpby_k$(TSUFFIX).$(SUFFIX)
saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX)

DBLASOBJS += \
damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX)

QBLASOBJS += \
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
qsum_k$(TSUFFIX).$(SUFFIX)
qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX)

CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
@@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@

$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@

$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL)
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@

$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL)
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@

$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL)
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@

$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@


+ 33
- 1
kernel/Makefile.L3 View File

@@ -24,6 +24,7 @@ endif

ifeq ($(ARCH), arm64)
USE_TRMM = 1
USE_DIRECT_SGEMM = 1
endif

ifeq ($(ARCH), riscv64)
@@ -95,9 +96,17 @@ endif

ifdef USE_DIRECT_SGEMM
ifndef SGEMMDIRECTKERNEL
ifeq ($(ARCH), x86_64)
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
endif
ifeq ($(ARCH), arm64)
ifeq ($(TARGET_CORE), ARMV9SME)
HAVE_SME = 1
endif
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c
endif
endif
endif

ifeq ($(BUILD_BFLOAT16), 1)
@@ -128,9 +137,20 @@ SKERNELOBJS += \
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)

ifdef USE_DIRECT_SGEMM
ifeq ($(ARCH), x86_64)
SKERNELOBJS += \
sgemm_direct$(TSUFFIX).$(SUFFIX) \
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(ARCH), arm64)
SKERNELOBJS += \
sgemm_direct$(TSUFFIX).$(SUFFIX)
ifdef HAVE_SME
SKERNELOBJS += \
sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \
sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX)
endif
endif
endif
endif

@@ -809,11 +829,23 @@ else
endif

ifdef USE_DIRECT_SGEMM
ifeq ($(ARCH), x86_64)
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(ARCH), arm64)
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
ifdef HAVE_SME
$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@
$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@
endif
endif
endif

ifeq ($(BUILD_BFLOAT16), 1)



+ 12
- 0
kernel/alpha/KERNEL View File

@@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S

ifndef SROTMKERNEL
SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

+ 10
- 0
kernel/arm/KERNEL View File

@@ -43,4 +43,14 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

ifndef SROTMKERNEL
SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

+ 40
- 49
kernel/arm/zscal.c View File

@@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_c/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

if ( (n <= 0) || (inc_x <= 0))
return(0);
if ((n <= 0) || (inc_x <= 0))
return(0);

inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
x[ip+1] = da_r * x[ip+1];
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
}
}
x[ip] = temp;
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

ip += inc_x2;
}

return(0);
x[ip] = temp;
ip += inc_x2;
}

return(0);
}



+ 10
- 0
kernel/arm64/KERNEL View File

@@ -45,4 +45,14 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

ifndef SROTMKERNEL
SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

+ 2
- 2
kernel/arm64/KERNEL.A64FX View File

@@ -1,6 +1,6 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE

SGEMVNKERNEL = gemv_n_sve.c
DGEMVNKERNEL = gemv_n_sve.c
SGEMVNKERNEL = gemv_n_sve_v4x3.c
DGEMVNKERNEL = gemv_n_sve_v4x3.c
SGEMVTKERNEL = gemv_t_sve_v4x3.c
DGEMVTKERNEL = gemv_t_sve_v4x3.c

+ 9
- 4
kernel/arm64/KERNEL.ARMV8SVE View File

@@ -74,16 +74,21 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

SSYMV_L_KERNEL = symv_L_sve_v1x4.c
SSYMV_U_KERNEL = symv_U_sve_v1x4.c
DSYMV_L_KERNEL = symv_L_sve_v1x4.c
DSYMV_U_KERNEL = symv_U_sve_v1x4.c

SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c


+ 3
- 0
kernel/arm64/KERNEL.ARMV9SME View File

@@ -0,0 +1,3 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE



+ 15
- 1
kernel/arm64/KERNEL.NEOVERSEN1 View File

@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = sgemv_n_neon.c
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

SSYMV_L_KERNEL = symv_L_asimd_4x4.c
SSYMV_U_KERNEL = symv_U_asimd_4x4.c
DSYMV_L_KERNEL = symv_L_asimd_4x4.c
DSYMV_U_KERNEL = symv_U_asimd_4x4.c

SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
@@ -98,8 +102,18 @@ ZNRM2KERNEL = znrm2.S

DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), CLANG)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
endif
else
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
endif
DSDOTKERNEL = dot.S

DGEMM_BETA = dgemm_beta.S


+ 6
- 4
kernel/arm64/KERNEL.NEOVERSEN2 View File

@@ -60,13 +60,13 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

@@ -198,3 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
SBGEMVTKERNEL = sbgemv_t_bfdot.c
SBGEMVNKERNEL = sbgemv_n_neon.c

+ 20
- 0
kernel/arm64/KERNEL.NEOVERSEV1 View File

@@ -1,4 +1,24 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE

SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
ifeq ($(BUILD_BFLOAT16), 1)
SBGEMM_BETA = sbgemm_beta_neoversev1.c
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)

SBGEMVNKERNEL = sbgemv_n_neon.c
SBGEMVTKERNEL = sbgemv_t_bfdot.c

endif

+ 5
- 0
kernel/arm64/KERNEL.NEOVERSEV2 View File

@@ -1 +1,6 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE

ifeq ($(BUILD_BFLOAT16), 1)
SBGEMVTKERNEL = sbgemv_t_bfdot.c
SBGEMVNKERNEL = sbgemv_n_neon.c
endif

+ 12
- 0
kernel/arm64/KERNEL.generic View File

@@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

ifndef SROTMKERNEL
SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

+ 217
- 216
kernel/arm64/copy_thunderx2t99.c View File

@@ -1,216 +1,217 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#include <arm_neon.h>
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define Y "x3" /* Y vector address */
#define INC_Y "x4" /* Y stride */
#define J "x5" /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define TMPF "s0"
#define INC_SHIFT "2"
#define N_DIV_SHIFT "2"
#define N_REM_MASK "3"
#else
#define TMPF "d0"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "1"
#define N_REM_MASK "1"
#endif
#else
#if !defined(DOUBLE)
#define TMPF "d0"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "1"
#define N_REM_MASK "1"
#else
#define TMPF "q0"
#define INC_SHIFT "4"
#define N_DIV_SHIFT "0"
#define N_REM_MASK "0"
#endif
#endif

#define KERNEL_F1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"str "TMPF", ["Y"] \n" \
"add "Y", "Y", "INC_Y" \n"

#define KERNEL_F \
"ldr q0, ["X"], #16 \n" \
"str q0, ["Y"], #16 \n"

#define INIT \
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n"


/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y) using AArch64
 * inline assembly. Always returns 0.
 *
 * The strides arrive as element counts; INIT shifts them left by INC_SHIFT
 * so that KERNEL_F1 can add them to the pointers as byte offsets.
 *
 * Two code paths exist inside the asm:
 *  - inc_x == 1 and inc_y == 1: KERNEL_F moves 16 bytes per iteration through
 *    q0 for (n >> N_DIV_SHIFT) iterations, then the (n & N_REM_MASK) leftover
 *    elements are moved one at a time with KERNEL_F1.
 *  - any other stride: KERNEL_F1 is unrolled four times per iteration for
 *    (n >> 2) iterations, then the (n & 3) leftover elements follow.
 */
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
/* Negative counts are a no-op; n == 0 is rejected by the cmp/ble pair below. */
if ( n < 0 ) return 0;

__asm__ __volatile__ (
/* Move the C operands into the fixed registers named by the N/X/INC_X/Y/INC_Y macros. */
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n"
" cmp "N", xzr \n"
" ble 8f //copy_kernel_L999 \n"
/* Anything other than unit stride on both vectors goes to the strided path (label 4). */
" cmp "INC_X", #1 \n"
" bne 4f //copy_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n"
" bne 4f //copy_kernel_S_BEGIN \n"

/* Unit-stride path: J = number of 16-byte moves. */
"// .Lcopy_kernel_F_BEGIN: \n"
" "INIT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq 2f //copy_kernel_F1 \n"
" .align 5 \n"

"1: //copy_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne 1b //copy_kernel_F \n"

"2: //copy_kernel_F1: \n"
/*
 * Complex double: N_DIV_SHIFT is 0, so the loop above already moved every
 * element and there is no remainder to handle.
 */
#if defined(COMPLEX) && defined(DOUBLE)
" b 8f //copy_kernel_L999 \n"
#else
" ands "J", "N", #"N_REM_MASK" \n"
" ble 8f //copy_kernel_L999 \n"
#endif

/* Unit-stride remainder, one element per iteration. */
"3: //copy_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 3b //copy_kernel_F10 \n"
" b 8f //copy_kernel_L999 \n"

/* Strided path: J = n / 4 blocks of four single-element moves. */
"4: //copy_kernel_S_BEGIN: \n"
" "INIT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble 6f //copy_kernel_S1 \n"

"5: //copy_kernel_S4: \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 5b //copy_kernel_S4 \n"

/* Strided remainder: the last n % 4 elements. */
"6: //copy_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble 8f //copy_kernel_L999 \n"

"7: //copy_kernel_S10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 7b //copy_kernel_S10 \n"

"8: //copy_kernel_L999: \n"

/* No output operands, so the inputs below are numbered from %0. */
:
: [N_] "r" (n), //%0
[X_] "r" (x), //%1
[INCX_] "r" (inc_x), //%2
[Y_] "r" (y), //%3
[INCY_] "r" (inc_y) //%4
/*
 * NOTE(review): KERNEL_F writes q0 while the clobber list names "d0";
 * confirm the compiler treats this as a clobber of the whole vector register.
 */
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0"
);

return 0;
}

#if defined(SMP)
/*
 * Worker routine handed to blas_level1_thread() by CNAME. Only n, x, inc_x,
 * y and inc_y are used; the dummy* parameters exist solely to match the
 * callback signature. Always returns 0.
 */
static int copy_thread_function(BLASLONG n,
                                BLASLONG dummy0,
                                BLASLONG dummy1,
                                FLOAT dummy2,
                                FLOAT *x, BLASLONG inc_x,
                                FLOAT *y, BLASLONG inc_y,
                                FLOAT *dummy3,
                                BLASLONG dummy4)
{
    /* Copy this worker's slice; the result of do_copy is not propagated. */
    (void)do_copy(n, x, inc_x, y, inc_y);
    return 0;
}
#endif

/*
 * Copy kernel entry point: copies n elements from x (stride inc_x) into y
 * (stride inc_y). Returns 0 in every case; n <= 0 is a no-op.
 *
 * Without SMP the work is a single do_copy() call. With SMP the copy is
 * split across threads through blas_level1_thread() only when n > 10000 and
 * both strides are non-zero; every other case stays on the calling thread.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
    int nthreads;
    FLOAT dummy_alpha;
#endif

    if (n <= 0) return 0;

#if defined(SMP)
    /*
     * A zero stride must not be threaded. With inc_y == 0 all n stores
     * target the same y element, so the copy runs serially to keep the value
     * left behind independent of how workers would interleave.
     */
    if (inc_x == 0 || inc_y == 0 || n <= 10000)
        nthreads = 1;
    else
        nthreads = num_cpu_avail(1);

    if (nthreads == 1) {
        do_copy(n, x, inc_x, y, inc_y);
    } else {
        int mode = 0;

        /* Describe the element type to the threading layer. */
#if !defined(COMPLEX)
        mode = BLAS_REAL;
#else
        mode = BLAS_COMPLEX;
#endif
#if !defined(DOUBLE)
        mode |= BLAS_SINGLE;
#else
        mode |= BLAS_DOUBLE;
#endif

        /*
         * dummy_alpha only fills the alpha slot of the threading interface;
         * copy_thread_function does not use the value it receives.
         */
        blas_level1_thread(mode, n, 0, 0, &dummy_alpha,
                           x, inc_x, y, inc_y, NULL, 0,
                           (void *)copy_thread_function, nthreads);
    }
#else
    do_copy(n, x, inc_x, y, inc_y);
#endif

    return 0;
}
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#define N "x0" /* vector length */
#define X "x1" /* X vector address */
#define INC_X "x2" /* X stride */
#define Y "x3" /* Y vector address */
#define INC_Y "x4" /* Y stride */
#define J "x5" /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define TMPF "s0"
#define INC_SHIFT "2"
#define N_DIV_SHIFT "2"
#define N_REM_MASK "3"
#else
#define TMPF "d0"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "1"
#define N_REM_MASK "1"
#endif
#else
#if !defined(DOUBLE)
#define TMPF "d0"
#define INC_SHIFT "3"
#define N_DIV_SHIFT "1"
#define N_REM_MASK "1"
#else
#define TMPF "q0"
#define INC_SHIFT "4"
#define N_DIV_SHIFT "0"
#define N_REM_MASK "0"
#endif
#endif
#define KERNEL_F1 \
"ldr "TMPF", ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"str "TMPF", ["Y"] \n" \
"add "Y", "Y", "INC_Y" \n"
#define KERNEL_F \
"ldr q0, ["X"], #16 \n" \
"str q0, ["Y"], #16 \n"
#define INIT \
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n"
/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y) using AArch64
 * inline assembly. Always returns 0.
 *
 * The strides arrive as element counts; INIT shifts them left by INC_SHIFT
 * so that KERNEL_F1 can add them to the pointers as byte offsets.
 *
 * Two code paths exist inside the asm:
 *  - inc_x == 1 and inc_y == 1: KERNEL_F moves 16 bytes per iteration through
 *    q0 for (n >> N_DIV_SHIFT) iterations, then the (n & N_REM_MASK) leftover
 *    elements are moved one at a time with KERNEL_F1.
 *  - any other stride: KERNEL_F1 is unrolled four times per iteration for
 *    (n >> 2) iterations, then the (n & 3) leftover elements follow.
 */
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
/* Negative counts are a no-op; n == 0 is rejected by the cmp/ble pair below. */
if ( n < 0 ) return 0;
__asm__ __volatile__ (
/* Move the C operands into the fixed registers named by the N/X/INC_X/Y/INC_Y macros. */
" mov "N", %[N_] \n"
" mov "X", %[X_] \n"
" mov "INC_X", %[INCX_] \n"
" mov "Y", %[Y_] \n"
" mov "INC_Y", %[INCY_] \n"
" cmp "N", xzr \n"
" ble 8f //copy_kernel_L999 \n"
/* Anything other than unit stride on both vectors goes to the strided path (label 4). */
" cmp "INC_X", #1 \n"
" bne 4f //copy_kernel_S_BEGIN \n"
" cmp "INC_Y", #1 \n"
" bne 4f //copy_kernel_S_BEGIN \n"
/* Unit-stride path: J = number of 16-byte moves. */
"// .Lcopy_kernel_F_BEGIN: \n"
" "INIT" \n"
" asr "J", "N", #"N_DIV_SHIFT" \n"
" cmp "J", xzr \n"
" beq 2f //copy_kernel_F1 \n"
/* The loop-alignment directive is left out when building with clang for Windows. */
#if !(defined(__clang__) && defined(OS_WINDOWS))
" .align 5 \n"
#endif
"1: //copy_kernel_F: \n"
" "KERNEL_F" \n"
" subs "J", "J", #1 \n"
" bne 1b //copy_kernel_F \n"
"2: //copy_kernel_F1: \n"
/*
 * Complex double: N_DIV_SHIFT is 0, so the loop above already moved every
 * element and there is no remainder to handle.
 */
#if defined(COMPLEX) && defined(DOUBLE)
" b 8f //copy_kernel_L999 \n"
#else
" ands "J", "N", #"N_REM_MASK" \n"
" ble 8f //copy_kernel_L999 \n"
#endif
/* Unit-stride remainder, one element per iteration. */
"3: //copy_kernel_F10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 3b //copy_kernel_F10 \n"
" b 8f //copy_kernel_L999 \n"
/* Strided path: J = n / 4 blocks of four single-element moves. */
"4: //copy_kernel_S_BEGIN: \n"
" "INIT" \n"
" asr "J", "N", #2 \n"
" cmp "J", xzr \n"
" ble 6f //copy_kernel_S1 \n"
"5: //copy_kernel_S4: \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 5b //copy_kernel_S4 \n"
/* Strided remainder: the last n % 4 elements. */
"6: //copy_kernel_S1: \n"
" ands "J", "N", #3 \n"
" ble 8f //copy_kernel_L999 \n"
"7: //copy_kernel_S10: \n"
" "KERNEL_F1" \n"
" subs "J", "J", #1 \n"
" bne 7b //copy_kernel_S10 \n"
"8: //copy_kernel_L999: \n"
/* No output operands, so the inputs below are numbered from %0. */
:
: [N_] "r" (n), //%0
[X_] "r" (x), //%1
[INCX_] "r" (inc_x), //%2
[Y_] "r" (y), //%3
[INCY_] "r" (inc_y) //%4
/*
 * NOTE(review): KERNEL_F writes q0 while the clobber list names "d0";
 * confirm the compiler treats this as a clobber of the whole vector register.
 */
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"d0"
);
return 0;
}
#if defined(SMP)
/*
 * Worker routine handed to blas_level1_thread() by CNAME. Only n, x, inc_x,
 * y and inc_y are used; the dummy* parameters exist solely to match the
 * callback signature. Always returns 0.
 */
static int copy_thread_function(BLASLONG n,
                                BLASLONG dummy0,
                                BLASLONG dummy1,
                                FLOAT dummy2,
                                FLOAT *x, BLASLONG inc_x,
                                FLOAT *y, BLASLONG inc_y,
                                FLOAT *dummy3,
                                BLASLONG dummy4)
{
    /* Copy this worker's slice; the result of do_copy is not propagated. */
    (void)do_copy(n, x, inc_x, y, inc_y);
    return 0;
}
#endif
/*
 * Copy kernel entry point: copies n elements from x (stride inc_x) into y
 * (stride inc_y). Returns 0 in every case; n <= 0 is a no-op.
 *
 * Without SMP the work is a single do_copy() call. With SMP the copy is
 * split across threads through blas_level1_thread() only when n > 10000 and
 * both strides are non-zero; every other case stays on the calling thread.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
    int nthreads;
    FLOAT dummy_alpha;
#endif

    if (n <= 0) return 0;

#if defined(SMP)
    /*
     * A zero stride must not be threaded. With inc_y == 0 all n stores
     * target the same y element, so the copy runs serially to keep the value
     * left behind independent of how workers would interleave.
     */
    if (inc_x == 0 || inc_y == 0 || n <= 10000)
        nthreads = 1;
    else
        nthreads = num_cpu_avail(1);

    if (nthreads == 1) {
        do_copy(n, x, inc_x, y, inc_y);
    } else {
        int mode = 0;

        /* Describe the element type to the threading layer. */
#if !defined(COMPLEX)
        mode = BLAS_REAL;
#else
        mode = BLAS_COMPLEX;
#endif
#if !defined(DOUBLE)
        mode |= BLAS_SINGLE;
#else
        mode |= BLAS_DOUBLE;
#endif

        /*
         * dummy_alpha only fills the alpha slot of the threading interface;
         * copy_thread_function does not use the value it receives.
         */
        blas_level1_thread(mode, n, 0, 0, &dummy_alpha,
                           x, inc_x, y, inc_y, NULL, 0,
                           (void *)copy_thread_function, nthreads);
    }
#else
    do_copy(n, x, inc_x, y, inc_y);
#endif

    return 0;
}

+ 2
- 0
kernel/arm64/dasum_thunderx2t99.c View File

@@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" cmp "J", xzr \n"
" beq 3f //asum_kernel_F1 \n"

#if !(defined(__clang__) && defined(OS_WINDOWS))
".align 5 \n"
#endif
"2: //asum_kernel_F32: \n"
" "KERNEL_F32" \n"
" subs "J", "J", #1 \n"


+ 1
- 1
kernel/arm64/dgemm_small_kernel_tn_sve.c View File

@@ -213,7 +213,7 @@ CNAME(BLASLONG M,
const BLASLONG n2 = N & -2;
const BLASLONG n8 = N & -8;

const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0;
FLOAT* packed_a =
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL;



+ 1
- 1
kernel/arm64/dgemm_small_kernel_tt_sve.c View File

@@ -219,7 +219,7 @@ CNAME(BLASLONG M,
const BLASLONG n4 = N & -4;
const BLASLONG n2 = N & -2;

const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0;
FLOAT* packed_a =
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL;



+ 50
- 3
kernel/arm64/dot.c View File

@@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);

#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
/*
 * Thread-count table used for the "neoversev1" core: maps the vector length
 * N to a thread count that never exceeds ncpu. Short vectors run on a single
 * thread; the cap grows in steps with N until all ncpu threads are allowed.
 * The step boundaries differ between the DOUBLE and single-precision builds.
 */
static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) {
#ifdef DOUBLE
  /* Everything up to 64500 elements (which includes N <= 10000) is serial. */
  if (N <= 64500L)   return 1;
  if (N <= 100000L)  return MIN(ncpu, 2);
  if (N <= 150000L)  return MIN(ncpu, 4);
  if (N <= 260000L)  return MIN(ncpu, 8);
  if (N <= 360000L)  return MIN(ncpu, 16);
  if (N <= 520000L)  return MIN(ncpu, 24);
  if (N <= 1010000L) return MIN(ncpu, 56);
  return ncpu;
#else
  /* Everything up to 110000 elements (which includes N <= 10000) is serial. */
  if (N <= 110000L)  return 1;
  if (N <= 200000L)  return MIN(ncpu, 2);
  if (N <= 280000L)  return MIN(ncpu, 4);
  if (N <= 520000L)  return MIN(ncpu, 8);
  if (N <= 830000L)  return MIN(ncpu, 16);
  if (N <= 1010000L) return MIN(ncpu, 24);
  return ncpu;
#endif
}
#endif

/*
 * Pick the number of threads for a dot product over n elements.
 *
 * num_cpu_avail(1) is queried exactly once and that single value is used for
 * every decision below.
 *  - Built for NEOVERSEV1 (real, non-bfloat16): always use the neoversev1 table.
 *  - DYNAMIC_ARCH (real, non-bfloat16): use the neoversev1 table when
 *    gotoblas_corename() reports "neoversev1".
 *  - Otherwise: one thread for n <= 10000, all available threads above that.
 */
static inline int get_dot_optimal_nthreads(BLASLONG n) {
  int ncpu = num_cpu_avail(1);

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16)
  return get_dot_optimal_nthreads_neoversev1(n, ncpu);
#else
#if defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16)
  if (strcmp(gotoblas_corename(), "neoversev1") == 0) {
    return get_dot_optimal_nthreads_neoversev1(n, ncpu);
  }
#endif

  // Default case: reuse the count obtained above instead of querying again.
  return (n <= 10000L) ? 1 : ncpu;
#endif
}
#endif

static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
RETURN_TYPE dot = 0.0;

#if defined(SMP)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
else
nthreads = num_cpu_avail(1);
nthreads = get_dot_optimal_nthreads(n);

if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y);
@@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y

blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, result, 0,
( void *)dot_thread_function, nthreads);
(void *)dot_thread_function, nthreads);

ptr = (RETURN_TYPE *)result;
for (i = 0; i < nthreads; i++) {


+ 3
- 2
kernel/arm64/dot_kernel_asimd.c View File

@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
" fadd v4.4s, v4.4s, v6.4s \n" \
" fadd v0.4s, v0.4s, v4.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n"
" faddp "OUT", v0.2s \n"

#else /* !defined(DSDOT) */
#define KERNEL_F1 \
@@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT
" asr %[J_], %[N_], #"N_DIV_SHIFT" \n"
" cmp %[J_], xzr \n"
" beq 3f //dot_kernel_F1 \n"
#if !(defined(__clang__) && defined(OS_WINDOWS))
" .align 5 \n"
#endif
"2: //dot_kernel_F: \n"
" "KERNEL_F" \n"
" subs %[J_], %[J_], #1 \n"


+ 71
- 12
kernel/arm64/gemv_n_sve.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
Copyright (c) 2024-2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr = a;

if (inc_y == 1) {
BLASLONG width = n / 3;
uint64_t sve_size = SV_COUNT();
for (j = 0; j < n; j++) {
SV_TYPE temp_vec = SV_DUP(alpha * x[ix]);
i = 0;
svbool_t pg = SV_WHILE(i, m);
while (svptest_any(SV_TRUE(), pg)) {
SV_TYPE a_vec = svld1(pg, a_ptr + i);
svbool_t pg_true = SV_TRUE();
svbool_t pg = SV_WHILE(0, m % sve_size);

FLOAT *a0_ptr = a + lda * width * 0;
FLOAT *a1_ptr = a + lda * width * 1;
FLOAT *a2_ptr = a + lda * width * 2;

for (j = 0; j < width; j++) {
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
ix = j * inc_x;

SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);

SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);

SV_TYPE y_vec = svld1(pg_true, y + i);
y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0);
y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0);
y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0);

svst1(pg_true, y + i, y_vec);
}

if (i < m) {
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);

SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);

SV_TYPE y_vec = svld1(pg, y + i);
y_vec = svmla_x(pg, y_vec, temp_vec, a_vec);
y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec);
y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec);
y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec);

ix += inc_x;

svst1(pg, y + i, y_vec);
i += sve_size;
pg = SV_WHILE(i, m);
}

a0_ptr += lda;
a1_ptr += lda;
a2_ptr += lda;
}

a_ptr = a2_ptr;
for (j = width * 3; j < n; j++) {
ix = j * inc_x;
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
SV_TYPE y_vec = svld1(pg_true, y + i);
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
SV_TYPE a_vec = svld1(pg_true, a_ptr + i);
y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec);
svst1(pg_true, y + i, y_vec);
}

if (i < m) {
SV_TYPE y_vec = svld1(pg, y + i);
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
SV_TYPE a_vec = svld1(pg, a_ptr + i);
y_vec = svmla_m(pg, y_vec, a_vec, x_vec);
svst1(pg, y + i, y_vec);
}

a_ptr += lda;
ix += inc_x;
}
return(0);
return (0);
}

for (j = 0; j < n; j++) {
@@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
ix += inc_x;
}
return (0);
}
}

+ 138
- 0
kernel/arm64/gemv_n_sve_v1x3.c View File

@@ -0,0 +1,138 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_sve.h>

#include "common.h"

#ifdef DOUBLE
#define SV_COUNT svcntd
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP svdup_f64
#else
#define SV_COUNT svcntw
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif

/*
 * GEMV "N" kernel: y += alpha * A * x, where column j of A starts at
 * a + j * lda and holds m contiguous elements. x is read with stride inc_x
 * and y is updated with stride inc_y. dummy1 and buffer are unused.
 * Always returns 0.
 *
 * inc_y != 1: plain scalar loops, one column at a time.
 * inc_y == 1: SVE path. The n columns are split into three groups of
 *   ceil(n / 3) consecutive columns; each outer iteration takes one column
 *   from every group and accumulates the three contributions into y one SVE
 *   vector at a time. Columns that fall beyond n get an all-false predicate,
 *   so they are neither loaded nor accumulated.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    BLASLONG row;
    BLASLONG col;
    BLASLONG x_off = 0;

    if (inc_y != 1) {
        /* Strided y: scalar fallback. */
        FLOAT *column = a;

        for (col = 0; col < n; col++) {
            FLOAT scaled_x = alpha * x[x_off];
            BLASLONG y_off = 0;

            for (row = 0; row < m; row++) {
                y[y_off] += scaled_x * column[row];
                y_off += inc_y;
            }
            column += lda;
            x_off += inc_x;
        }
        return (0);
    }

    /* Contiguous y: three column groups processed side by side. */
    BLASLONG group = (n + 3 - 1) / 3;   /* columns per group, rounded up */
    BLASLONG vl = SV_COUNT();           /* elements per SVE vector */

    FLOAT *col_a = a;
    FLOAT *col_b = a + lda * group;
    FLOAT *col_c = a + lda * group * 2;

    FLOAT *x_a = x;
    FLOAT *x_b = x + inc_x * group;
    FLOAT *x_c = x + inc_x * group * 2;

    for (col = 0; col < group; col++) {
        /* Does each group still have a real column at this position? */
        int has_a = col < n;
        int has_b = (col + group) < n;
        int has_c = (col + group * 2) < n;

        svbool_t keep_a = has_a ? SV_TRUE() : svpfalse();
        svbool_t keep_b = has_b ? SV_TRUE() : svpfalse();
        svbool_t keep_c = has_c ? SV_TRUE() : svpfalse();

        /* alpha * x[j] broadcast for each group; zero where there is no column. */
        SV_TYPE xs_a = has_a ? SV_DUP(alpha * x_a[x_off]) : SV_DUP(0.0);
        SV_TYPE xs_b = has_b ? SV_DUP(alpha * x_b[x_off]) : SV_DUP(0.0);
        SV_TYPE xs_c = has_c ? SV_DUP(alpha * x_c[x_off]) : SV_DUP(0.0);

        /* Full vectors of rows. */
        for (row = 0; row + vl <= m; row += vl) {
            SV_TYPE acc = svld1_vnum(SV_TRUE(), y + row, 0);

            SV_TYPE va = svld1_vnum(keep_a, col_a + row, 0);
            SV_TYPE vb = svld1_vnum(keep_b, col_b + row, 0);
            SV_TYPE vc = svld1_vnum(keep_c, col_c + row, 0);

            acc = svmla_m(keep_a, acc, xs_a, va);
            acc = svmla_m(keep_b, acc, xs_b, vb);
            acc = svmla_m(keep_c, acc, xs_c, vc);

            svst1_vnum(SV_TRUE(), y + row, 0, acc);
        }

        /* Remaining rows (fewer than one full vector). */
        if (row < m) {
            svbool_t tail = SV_WHILE(row, m);

            svbool_t tail_a = svand_z(SV_TRUE(), tail, keep_a);
            svbool_t tail_b = svand_z(SV_TRUE(), tail, keep_b);
            svbool_t tail_c = svand_z(SV_TRUE(), tail, keep_c);

            SV_TYPE acc = svld1_vnum(tail, y + row, 0);

            SV_TYPE va = svld1_vnum(tail_a, col_a + row, 0);
            SV_TYPE vb = svld1_vnum(tail_b, col_b + row, 0);
            SV_TYPE vc = svld1_vnum(tail_c, col_c + row, 0);

            acc = svmla_m(tail_a, acc, xs_a, va);
            acc = svmla_m(tail_b, acc, xs_b, vb);
            acc = svmla_m(tail_c, acc, xs_c, vc);

            svst1_vnum(tail, y + row, 0, acc);
        }

        /* Step every group to its next column / x element. */
        col_a += lda;
        col_b += lda;
        col_c += lda;
        x_off += inc_x;
    }
    return (0);
}

+ 207
- 0
kernel/arm64/gemv_n_sve_v4x3.c View File

@@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_sve.h>

#include "common.h"

#ifdef DOUBLE
#define SV_COUNT svcntd
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP svdup_f64
#else
#define SV_COUNT svcntw
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif

/*
 * GEMV "N" kernel: y += alpha * A * x, where column j of A starts at
 * a + j * lda and holds m contiguous elements. x is read with stride inc_x
 * and y is updated with stride inc_y. dummy1 and buffer are not used.
 * Always returns 0.
 *
 * inc_y == 1 takes the SVE path: the n columns are split into three groups
 * of ceil(n / 3) consecutive columns, and each outer iteration combines one
 * column from every group while walking the rows four SVE vectors at a time.
 * Any other inc_y falls through to the scalar loops at the end.
 *
 * Naming in the SVE path: in pgRC / aRC_vec, R (0..3) is the vector index
 * inside the four-vector row tile and C (0..2) is the column group.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT temp;

ix = 0;
a_ptr = a;

if (inc_y == 1) {
/* Columns per group, rounded up so that three groups cover all n columns. */
BLASLONG width = (n + 3 - 1) / 3;

/* Start of each column group in A. */
FLOAT *a0_ptr = a_ptr + lda * width * 0;
FLOAT *a1_ptr = a_ptr + lda * width * 1;
FLOAT *a2_ptr = a_ptr + lda * width * 2;

/* Matching start of each group in x. */
FLOAT *x0_ptr = x + inc_x * width * 0;
FLOAT *x1_ptr = x + inc_x * width * 1;
FLOAT *x2_ptr = x + inc_x * width * 2;

for (j = 0; j < width; j++) {
/*
 * All-true when the group still has a real column at position j, all-false
 * otherwise, so columns past n are neither loaded nor accumulated.
 */
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

/* alpha * x[column] broadcast per group; zero where the group has no column. */
SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0);
i = 0;
BLASLONG sve_size = SV_COUNT();
/* Main loop: row tiles of four full SVE vectors. */
while ((i + sve_size * 4 - 1) < m) {
SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0);
SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1);
SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2);
SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

/* Accumulate group 0, then group 1, then group 2 into each y vector. */
y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec);
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec);
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec);
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec);
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec);
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec);
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec);

svst1_vnum(SV_TRUE(), y + i, 0, y0_vec);
svst1_vnum(SV_TRUE(), y + i, 1, y1_vec);
svst1_vnum(SV_TRUE(), y + i, 2, y2_vec);
svst1_vnum(SV_TRUE(), y + i, 3, y3_vec);
i += sve_size * 4;
}

/* Tail: fewer than four full vectors of rows remain. */
if (i < m) {
/* Row predicates for each of the four vectors of the partial tile. */
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);
svbool_t pg1 = SV_WHILE(i + sve_size * 1, m);
svbool_t pg2 = SV_WHILE(i + sve_size * 2, m);
svbool_t pg3 = SV_WHILE(i + sve_size * 3, m);

/* Combine "row is in range" with "column exists" for every load/accumulate. */
pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg10 = svand_z(SV_TRUE(), pg1, pg10);
pg20 = svand_z(SV_TRUE(), pg2, pg20);
pg30 = svand_z(SV_TRUE(), pg3, pg30);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg11 = svand_z(SV_TRUE(), pg1, pg11);
pg21 = svand_z(SV_TRUE(), pg2, pg21);
pg31 = svand_z(SV_TRUE(), pg3, pg31);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
pg12 = svand_z(SV_TRUE(), pg1, pg12);
pg22 = svand_z(SV_TRUE(), pg2, pg22);
pg32 = svand_z(SV_TRUE(), pg3, pg32);

SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0);
SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1);
SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2);
SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec);
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec);
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec);
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec);
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec);
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec);
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec);

svst1_vnum(pg0, y + i, 0, y0_vec);
svst1_vnum(pg1, y + i, 1, y1_vec);
svst1_vnum(pg2, y + i, 2, y2_vec);
svst1_vnum(pg3, y + i, 3, y3_vec);
}
/* Advance every group to its next column and x element. */
a0_ptr += lda;
a1_ptr += lda;
a2_ptr += lda;
ix += inc_x;
}
return(0);
}

/* Strided y: scalar fallback, one column of A per outer iteration. */
for (j = 0; j < n; j++) {
temp = alpha * x[ix];
iy = 0;
for (i = 0; i < m; i++) {
y[iy] += temp * a_ptr[i];
iy += inc_y;
}
a_ptr += lda;
ix += inc_x;
}
return (0);
}

+ 55
- 39
kernel/arm64/gemv_t_sve_v1x3.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
Copyright (c) 2024, 2025 The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *y_ptr;
FLOAT temp;

iy = 0;

if (inc_x == 1) {
BLASLONG width = (n + 3 - 1) / 3;
BLASLONG width = n / 3;
BLASLONG sve_size = SV_COUNT();
svbool_t pg_true = SV_TRUE();
svbool_t pg = SV_WHILE(0, m % sve_size);

FLOAT *a0_ptr = a + lda * width * 0;
FLOAT *a1_ptr = a + lda * width * 1;
@@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
FLOAT *y2_ptr = y + inc_y * width * 2;

for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp00_vec = SV_DUP(0.0);
SV_TYPE temp01_vec = SV_DUP(0.0);
SV_TYPE temp02_vec = SV_DUP(0.0);

i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
SV_TYPE x0_vec = svld1(pg_true, x + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec);

i += sve_size * 1;
}

if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
SV_TYPE x0_vec = svld1(pg, x + i);

SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec);
}

if ((j + width * 0) < n) {
temp = svaddv(SV_TRUE(), temp00_vec);
y0_ptr[iy] += alpha * temp;
}
if ((j + width * 1) < n) {
temp = svaddv(SV_TRUE(), temp01_vec);
y1_ptr[iy] += alpha * temp;
}
if ((j + width * 2) < n) {
temp = svaddv(SV_TRUE(), temp02_vec);
y2_ptr[iy] += alpha * temp;
}
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec);
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec);
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec);

iy += inc_y;

a0_ptr += lda;
@@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
a2_ptr += lda;
}

a_ptr = a2_ptr;
y_ptr = y2_ptr;
for (j = width * 3; j < n; j++) {
SV_TYPE temp_vec = SV_DUP(0.0);

i = 0;
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x_vec = svld1(pg_true, x + i);

SV_TYPE a_vec = svld1(pg_true, a_ptr + i);

temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec);

i += sve_size * 1;
}

if (i < m) {
SV_TYPE x_vec = svld1(pg, x + i);

SV_TYPE a_vec = svld1(pg, a_ptr + i);

temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec);
}

y_ptr[iy] += alpha * svaddv(pg_true, temp_vec);

iy += inc_y;

a_ptr += lda;
}

return(0);
}



+ 2
- 1
kernel/arm64/sasum_thunderx2t99.c View File

@@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" asr "J", "N", #6 \n"
" cmp "J", xzr \n"
" beq 3f //asum_kernel_F1 \n"
#if !(defined(__clang__) && defined(OS_WINDOWS))
".align 5 \n"
#endif
"2: //asum_kernel_F64: \n"
" "KERNEL_F64" \n"
" subs "J", "J", #1 \n"


+ 83
- 0
kernel/arm64/sbgemm_beta_neoversev1.c View File

@@ -0,0 +1,83 @@
/***************************************************************************
* Copyright (c) 2024, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#include "common.h"

/*
 * Beta-scaling kernel: scales (or clears) C in place.
 *
 * C is walked as n runs of m contiguous FLOAT elements; consecutive runs start
 * ldc elements apart.  When beta == ZERO every element is overwritten with
 * ZERO (so existing NaN/Inf values in C are discarded rather than multiplied);
 * otherwise every element is multiplied by beta.
 *
 * Parameters:
 *   m, n     - run length and number of runs.
 *   beta     - scaling factor.
 *   c, ldc   - start of C and distance between consecutive runs.
 *   dummy1..dummy5 - unused here.
 *
 * Returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2,
          BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c,
          BLASLONG ldc) {

  BLASLONG i, j;
  BLASLONG chunk, remain;
  FLOAT *c_offset1, *c_offset;

  c_offset = c;
  chunk = m >> 3; /* number of full groups of 8 elements per run */
  remain = m & 7; /* leftover elements after the groups of 8 */

  if (beta == ZERO) {
    for (j = n; j > 0; j--) {
      c_offset1 = c_offset;
      c_offset += ldc;
      /* Main part of the run, manually unrolled by 8. */
      for (i = chunk; i > 0; i--) {
        *(c_offset1 + 0) = ZERO;
        *(c_offset1 + 1) = ZERO;
        *(c_offset1 + 2) = ZERO;
        *(c_offset1 + 3) = ZERO;
        *(c_offset1 + 4) = ZERO;
        *(c_offset1 + 5) = ZERO;
        *(c_offset1 + 6) = ZERO;
        *(c_offset1 + 7) = ZERO;
        c_offset1 += 8;
      }
      /* Tail of the run, one element at a time. */
      for (i = remain; i > 0; i--) {
        *c_offset1 = ZERO;
        c_offset1++;
      }
    }
  } else {
    for (j = n; j > 0; j--) {
      c_offset1 = c_offset;
      c_offset += ldc;
      /* Main part of the run, manually unrolled by 8. */
      for (i = chunk; i > 0; i--) {
        *(c_offset1 + 0) *= beta;
        *(c_offset1 + 1) *= beta;
        *(c_offset1 + 2) *= beta;
        *(c_offset1 + 3) *= beta;
        *(c_offset1 + 4) *= beta;
        *(c_offset1 + 5) *= beta;
        *(c_offset1 + 6) *= beta;
        *(c_offset1 + 7) *= beta;
        c_offset1 += 8;
      }
      /* Tail of the run, one element at a time. */
      for (i = remain; i > 0; i--) {
        *c_offset1 *= beta;
        c_offset1++;
      }
    }
  }
  return 0;
}

+ 46
- 0
kernel/arm64/sbgemm_kernel_4x4_neoversev1.c View File

@@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) 2024-2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#include <arm_sve.h>

#include "common.h"

#define ALPHA_ONE
#include "sbgemm_kernel_4x4_neoversev1_impl.c"
#undef ALPHA_ONE
#include "sbgemm_kernel_4x4_neoversev1_impl.c"

/*
 * Entry point of the kernel: forwards all arguments unchanged to one of the
 * two instantiations of sbgemm_kernel_4x4_neoversev1_impl.c included above.
 * The ALPHA_ONE instantiation is used when alpha compares equal to 1.0f, the
 * general one otherwise.  Returns whatever the selected implementation
 * returns.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
          FLOAT *C, BLASLONG ldc) {
  if (alpha == 1.0f)
    return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc);

  return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc);
}


+ 414
- 0
kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c View File

@@ -0,0 +1,414 @@
/***************************************************************************
* Copyright (c) 2024-2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#include <arm_sve.h>

#include "common.h"

/*
 * Helper macros for the kernel body below.  They operate on local variables
 * named by token pasting: mc<M><N> (fp32 accumulators), ma<M> and mb<N>
 * (bf16 operand vectors).
 *
 * This file is meant to be included more than once with ALPHA_ONE toggled.
 * Every macro except UPDATE_C expands identically on each inclusion; UPDATE_C
 * differs, so it is #undef'ed before being defined to avoid redefining a
 * macro with a different replacement list.
 */

/* Zero the accumulator mc<M><N>. */
#define INIT_C(M, N) mc##M##N = svdup_f32(0);

/* mc<M><N> = svbfmmla(mc<M><N>, ma<M>, mb<N>): bf16 matrix
 * multiply-accumulate into the fp32 accumulator. */
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);

/* Zero the four accumulators mc00, mc01, mc10 and mc11. */
#define INIT_C_4x4                                                             \
  do {                                                                         \
    INIT_C(0, 0);                                                              \
    INIT_C(0, 1);                                                              \
    INIT_C(1, 0);                                                              \
    INIT_C(1, 1);                                                              \
  } while (0);

/*
 * UPDATE_C(PG, PTR, DST, SRC): read-modify-write of C under predicate PG.
 * DST is loaded from PTR, combined with SRC and stored back to PTR:
 *   ALPHA_ONE:  DST = SRC + DST
 *   otherwise:  DST = svalpha * SRC + DST   (svalpha is a local of the caller)
 *
 * NOTE: the ';' after 'while (0)' is intentionally kept so that call sites
 * written without their own terminating ';' continue to compile.
 */
#undef UPDATE_C
#ifdef ALPHA_ONE
#define UPDATE_C(PG, PTR, DST, SRC)                                            \
  do {                                                                         \
    DST = svld1_f32((PG), (PTR));                                              \
    DST = svadd_z((PG), SRC, DST);                                             \
    svst1_f32((PG), (PTR), DST);                                               \
  } while (0);
#else
#define UPDATE_C(PG, PTR, DST, SRC)                                            \
  do {                                                                         \
    DST = svld1_f32((PG), (PTR));                                              \
    DST = svmad_z((PG), svalpha, SRC, DST);                                    \
    svst1_f32((PG), (PTR), DST);                                               \
  } while (0);
#endif

/* Despite the name this uses svuzp1: tmp receives the even-indexed lanes of
 * mc0 followed by those of mc1; the lanes selected by PG are then packed
 * towards lane 0 of vc with svcompact. */
#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc)                               \
  do {                                                                         \
    (tmp) = svuzp1_f32((mc0), (mc1));                                          \
    (vc) = svcompact_f32((PG), (tmp));                                         \
  } while (0)

/* Same as ZIP_EVEN_ELEMENTS but with svuzp2, i.e. the odd-indexed lanes. */
#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc)                                \
  do {                                                                         \
    (tmp) = svuzp2_f32((mc0), (mc1));                                          \
    (vc) = svcompact_f32((PG), (tmp));                                         \
  } while (0)

/* Add to every lane of mc<M><N> the lane 4 positions further on (svext of the
 * vector with itself, starting at lane 4).
 * NOTE(review): with 8 fp32 lanes (256-bit SVE) this leaves the sum of the
 * upper and lower halves in lanes 0..3 - confirm the vector length. */
#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP)                                  \
  do {                                                                         \
    TMP = svext_f32(mc##M##N, mc##M##N, 4);                                    \
    mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP));                    \
  } while (0)

/*
 * bf16 GEMM micro-kernel body.  Compiled as
 * sbgemm_kernel_neoversev1_alpha_one when ALPHA_ONE is defined and as
 * sbgemm_kernel_neoversev1_alpha otherwise; the only difference is how
 * UPDATE_C folds the result into C (plain add vs. multiply by alpha and add).
 *
 * A and B are read as packed bf16 panels, C is updated in place:
 *   - the n dimension is processed in groups of 4, then 2, then 1; each group
 *     writes to C through pointers that are ldc elements apart;
 *   - inside each group the m dimension is processed in blocks of 4, then 2,
 *     then 1, advancing those C pointers by the block size;
 *   - the k dimension is rounded up to a multiple of 8 (pad_k) and consumed 8
 *     at a time with svbfmmla.
 *
 * NOTE(review): the loads use an all-true predicate with operand strides of
 * 16 bf16 elements, and the accumulators are folded with an svext by 4 fp32
 * lanes, so the code appears to assume 256-bit SVE vectors and zero-padded
 * packed panels - confirm against the packing routines.
 *
 * Returns 0.
 */
#ifdef ALPHA_ONE
int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k,
                                       FLOAT alpha, IFLOAT *A, IFLOAT *B,
                                       FLOAT *C, BLASLONG ldc)
#else
int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k,
                                   FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C,
                                   BLASLONG ldc)
#endif
{

  /* k rounded up to the next multiple of 8. */
  BLASLONG pad_k = (k + 7) & ~7;
  svbfloat16_t ma0, ma1, mb0, mb1;
  svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3;
  svfloat32_t tmp;
  svfloat32_t svalpha = svdup_f32(alpha);

  svbool_t pg16_all = svptrue_b16();

  /* Predicates selecting the first 1, 2 or 4 fp32 lanes for the C updates. */
  svbool_t pg32_first_1 = svwhilelt_b32(0, 1);
  svbool_t pg32_first_2 = svwhilelt_b32(0, 2);
  svbool_t pg32_first_4 = svwhilelt_b32(0, 4);

  /* Lanes 0 and 1 of every 128-bit quadword. */
  svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0);

  bfloat16_t *ptr_a = (bfloat16_t *)A;
  bfloat16_t *ptr_b = (bfloat16_t *)B;
  FLOAT *ptr_c = C;

  bfloat16_t *ptr_a0;
  bfloat16_t *ptr_b0;
  FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3;

  /* ---- groups of 4 along n ---- */
  for (BLASLONG j = 0; j < n / 4; j++) {
    ptr_c0 = ptr_c;
    ptr_c1 = ptr_c0 + ldc;
    ptr_c2 = ptr_c1 + ldc;
    ptr_c3 = ptr_c2 + ldc;
    ptr_c += 4 * ldc;
    ptr_a = (bfloat16_t *)A; /* restart at the beginning of packed A */

    /* 4 (m) x 4 (n) blocks. */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;

      ptr_b0 = ptr_b;

      INIT_C_4x4;

      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);

        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);
        MATMUL(1, 0);
        MATMUL(1, 1);

        ptr_a0 += 32;
        ptr_b0 += 32;
      }

      /* Fold the partial sums held 4 lanes apart into lanes 0..3. */
      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp);

      /* Even lanes of (mc00, mc10) feed ptr_c0, odd lanes feed ptr_c1;
       * likewise (mc01, mc11) feed ptr_c2 / ptr_c3. */
      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1);

      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2);
      UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3);

      ptr_c0 += 4;
      ptr_c1 += 4;
      ptr_c2 += 4;
      ptr_c3 += 4;
    }

    /* 2 (m) x 4 (n) block. */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;

      ptr_b0 = ptr_b;
      INIT_C(0, 0);
      INIT_C(0, 1);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);

        ptr_a0 += 16;
        ptr_b0 += 32;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);

      /* Even lanes go to ptr_c0 / ptr_c2, odd lanes to ptr_c1 / ptr_c3. */
      vc0 = svuzp1(mc00, mc00);
      vc1 = svuzp2(mc00, mc00);
      vc2 = svuzp1(mc01, mc01);
      vc3 = svuzp2(mc01, mc01);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2);
      UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3);

      ptr_c0 += 2;
      ptr_c1 += 2;
      ptr_c2 += 2;
      ptr_c3 += 2;
    }

    /* 1 (m) x 4 (n) block. */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(0, 1);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        mb1 = svld1_bf16(pg16_all, ptr_b0 + 16);

        MATMUL(0, 0);
        MATMUL(0, 1);

        ptr_a0 += 16;
        ptr_b0 += 32;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp);

      /* Lane 0 of the accumulator is used directly; svuzp2 moves lane 1 to
       * lane 0 for the neighbouring C pointer.  (svcompact might be a more
       * straightforward way to express this.) */
      vc1 = svuzp2(mc00, mc00);
      vc3 = svuzp2(mc01, mc01);

      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
      UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1);
      UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01);
      UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3);
    }

    ptr_b += 4 * pad_k;
  }

  /* ---- group of 2 along n ---- */
  if (n & 2) {
    ptr_c0 = ptr_c;
    ptr_c1 = ptr_c0 + ldc;
    ptr_c += 2 * ldc;
    ptr_a = (bfloat16_t *)A;

    /* 4 (m) x 2 (n) blocks. */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;

      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(1, 0);

      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);

        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 32;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);

      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);
      ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2);

      ptr_c0 += 4;
      ptr_c1 += 4;
    }

    /* 2 (m) x 2 (n) block. */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);

      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      vc0 = svuzp1(mc00, mc00);
      vc1 = svuzp2(mc00, mc00);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);
      UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1);

      ptr_c0 += 2;
      ptr_c1 += 2;
    }

    /* 1 (m) x 2 (n) block. */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;
      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);
        MATMUL(0, 0);
        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      vc1 = svuzp2(mc00, mc00);

      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
      UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1);
    }

    ptr_b += 2 * pad_k;
  }

  /* ---- last single step along n ---- */
  if (n & 1) { // TODO: this case looks like overhead; find out whether it
               // actually occurs in our use case.
    ptr_c0 = ptr_c;
    ptr_a = (bfloat16_t *)A;

    /* 4 (m) x 1 (n) blocks: only the even lanes are needed. */
    for (BLASLONG i = 0; i < m / 4; i++) {
      ptr_a0 = ptr_a;
      ptr_a += 4 * pad_k;

      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      INIT_C(1, 0);

      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        ma1 = svld1_bf16(pg16_all, ptr_a0 + 16);

        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 32;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);
      ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp);

      ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0);

      UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0);

      ptr_c0 += 4;
    }

    /* 2 (m) x 1 (n) block. */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * pad_k;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);

      for (BLASLONG p = 0; p < pad_k; p += 8) {
        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);

      vc0 = svuzp1(mc00, mc00);

      UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0);

      ptr_c0 += 2;
    }

    /* 1 (m) x 1 (n) block. */
    if (m & 1) {
      ptr_a0 = ptr_a;
      ptr_b0 = ptr_b;

      INIT_C(0, 0);
      for (BLASLONG p = 0; p < pad_k; p += 8) {

        ma0 = svld1_bf16(pg16_all, ptr_a0);
        mb0 = svld1_bf16(pg16_all, ptr_b0);

        MATMUL(0, 0);
        ptr_a0 += 16;
        ptr_b0 += 16;
      }

      ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp);

      UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00);
    }
  }

  return 0;
}

+ 148
- 0
kernel/arm64/sbgemm_ncopy_4_neoversev1.c View File

@@ -0,0 +1,148 @@
/***************************************************************************
* Copyright (c) 2024-2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#include <arm_sve.h>

#include "common.h"

/*
 * Packing routine: copies n source vectors of m bf16 values each (vector j
 * starts at a + j * lda) into the contiguous buffer b.
 *
 * Source vectors are handled in groups of 4, then 2, then 1.  For every 8
 * elements along m, each pair of source vectors yields one 16-element output
 * vector in which 64-bit chunks (4 bf16) of the two sources alternate
 * (svzip1_u64).  A group of 4 therefore emits 32 bf16 per step, a group of 2
 * emits 16, and a single vector is paired with an all-zero vector and also
 * emits 16.
 *
 * The last (m & 7) elements are loaded under a predicate, so the inactive
 * lanes are zero, and a full output vector is still stored: the tail is
 * zero-padded up to a multiple of 8.
 *
 * NOTE(review): the full-vector stores combined with output strides of 16
 * bf16 assume a 256-bit SVE vector length.
 *
 * Returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  IFLOAT *a_offset = a;
  IFLOAT *a_offsetx[4];
  IFLOAT *b_offset = b;

  /* bf16 zero built from its bit pattern.  A union is used for the
   * reinterpretation instead of storing through a casted uint16_t pointer,
   * which would violate the aliasing rules. */
  union {
    bfloat16_t bf16;
    uint16_t bits;
  } zero_value = {.bits = 0};

  svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine.
  svbool_t pg16_first_8 = svwhilelt_b16(0, 8);

  svbfloat16_t v0, v1, v2, v3;
  svuint64_t t0, t1;

  /* All-zero partner vector for the single-vector (n & 1) case. */
  svbfloat16_t zero_vec = svdup_bf16(zero_value.bf16);

  BLASLONG rest = m & 7;
  svbool_t pg16_rest = svwhilelt_b16_s32(0, rest);

  /* ---- groups of 4 source vectors ---- */
  for (BLASLONG j = 0; j < n / 4; j++) {
    a_offsetx[0] = a_offset;
    a_offsetx[1] = a_offsetx[0] + lda;
    a_offsetx[2] = a_offsetx[1] + lda;
    a_offsetx[3] = a_offsetx[2] + lda;
    a_offset += 4 * lda;

    for (BLASLONG i = 0; i < m / 8; i++) {
      v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);
      v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]);
      v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]);
      v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]);

      t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
      t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3));

      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16,
                 svreinterpret_bf16_u64(t1));

      a_offsetx[0] += 8;
      a_offsetx[1] += 8;
      a_offsetx[2] += 8;
      a_offsetx[3] += 8;

      b_offset += 32;
    }

    if (rest) { // remainder along k dim
      v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
      v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]);
      v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]);
      v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]);

      t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
      t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3));

      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16,
                 svreinterpret_bf16_u64(t1));

      b_offset += 32;
    }
  }

  /* ---- group of 2 source vectors ---- */
  if (n & 2) {
    a_offsetx[0] = a_offset;
    a_offsetx[1] = a_offsetx[0] + lda;
    a_offset += 2 * lda;

    for (BLASLONG i = 0; i < m / 8; i++) {
      v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);
      v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]);

      t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));

      b_offset += 16;
      a_offsetx[0] += 8;
      a_offsetx[1] += 8;
    }

    if (rest) { // remainder along k dim
      v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
      v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]);

      t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));

      b_offset += 16;
    }
  }

  /* ---- last single source vector, paired with zeros ---- */
  if (n & 1) {
    a_offsetx[0] = a_offset;

    for (BLASLONG i = 0; i < m / 8; i++) {
      v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]);

      t0 = svzip1_u64(svreinterpret_u64_bf16(v0),
                      svreinterpret_u64_bf16(zero_vec));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));

      b_offset += 16;
      a_offsetx[0] += 8;
    }

    if (rest) { // remainder along k dim
      v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]);
      t0 = svzip1_u64(svreinterpret_u64_bf16(v0),
                      svreinterpret_u64_bf16(zero_vec));
      svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0));
    }
  }

  return 0;
}

+ 361
- 0
kernel/arm64/sbgemm_tcopy_4_neoversev1.c View File

@@ -0,0 +1,361 @@
/***************************************************************************
* Copyright (c) 2024-2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include "common.h"
#include <arm_neon.h>
#include <arm_sve.h>

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
BLASLONG pad_m = ((m + 7) & ~7);
BLASLONG rest = (m & 7); // rest along m dim

IFLOAT *a_offset;
IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3;
IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7;

IFLOAT *b_offset;
IFLOAT *b_offset0, *b_offset1;

a_offset = a;
b_offset = b;

svuint16_t c0, c1, c2, c3, c4, c5, c6, c7;
svuint16_t t0, t1, t2, t3;
svuint32_t m00, m01, m10, m11;
svuint64_t st_offsets_0, st_offsets_1;

svbool_t pg16_first_4 = svwhilelt_b16(0, 4);
svbool_t pg16_first_8 = svwhilelt_b16(0, 8);

svbool_t pg64_first_4 = svwhilelt_b64(0, 4);
u_int32_t sizeof_u64 = 8;
u_int64_t _st_offsets_0[4] = {
0 * sizeof_u64,
1 * sizeof_u64,
4 * sizeof_u64,
5 * sizeof_u64,
};

u_int64_t _st_offsets_1[4] = {
2 * sizeof_u64,
3 * sizeof_u64,
6 * sizeof_u64,
7 * sizeof_u64,
};

st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0);
st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1);

for (BLASLONG j = 0; j < n / 8; j++) {
a_offset0 = a_offset;
a_offset1 = a_offset0 + lda;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset5 = a_offset4 + lda;
a_offset6 = a_offset5 + lda;
a_offset7 = a_offset6 + lda;
a_offset += 8;

b_offset0 = b_offset;
b_offset1 = b_offset0 + 4 * pad_m;

b_offset += 8 * pad_m;
for (BLASLONG i = 0; i < m / 8; i++) {
// transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4
// small blocks
c0 = svld1_u16(pg16_first_8, a_offset0);
c1 = svld1_u16(pg16_first_8, a_offset1);
c2 = svld1_u16(pg16_first_8, a_offset2);
c3 = svld1_u16(pg16_first_8, a_offset3);
c4 = svld1_u16(pg16_first_8, a_offset4);
c5 = svld1_u16(pg16_first_8, a_offset5);
c6 = svld1_u16(pg16_first_8, a_offset6);
c7 = svld1_u16(pg16_first_8, a_offset7);

t0 = svzip1_u16(c0, c1);
t1 = svzip1_u16(c2, c3);
t2 = svzip1_u16(c4, c5);
t3 = svzip1_u16(c6, c7);

m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));

svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_0, svreinterpret_u64_u32(m00));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_1, svreinterpret_u64_u32(m01));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1,
st_offsets_0, svreinterpret_u64_u32(m10));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1,
st_offsets_1, svreinterpret_u64_u32(m11));

a_offset0 += 8 * lda;
a_offset1 += 8 * lda;
a_offset2 += 8 * lda;
a_offset3 += 8 * lda;
a_offset4 += 8 * lda;
a_offset5 += 8 * lda;
a_offset6 += 8 * lda;
a_offset7 += 8 * lda;

b_offset0 += 32;
b_offset1 += 32;
}

if (rest) {
c0 = svld1_u16(pg16_first_8, a_offset0);
c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0));
c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0));
c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0));
c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0));
c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0));
c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0));
c7 = (svdup_u16(0));

t0 = svzip1_u16(c0, c1);
t1 = svzip1_u16(c2, c3);
t2 = svzip1_u16(c4, c5);
t3 = svzip1_u16(c6, c7);

m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));

svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_0, svreinterpret_u64_u32(m00));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_1, svreinterpret_u64_u32(m01));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1,
st_offsets_0, svreinterpret_u64_u32(m10));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1,
st_offsets_1, svreinterpret_u64_u32(m11));
}
}

if (n & 4) {
a_offset0 = a_offset;
a_offset1 = a_offset0 + lda;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset5 = a_offset4 + lda;
a_offset6 = a_offset5 + lda;
a_offset7 = a_offset6 + lda;
a_offset += 4;

b_offset0 = b_offset;
b_offset += 4 * pad_m;

for (BLASLONG i = 0; i < m / 8; i++) {
// transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4
// small blocks
c0 = svld1_u16(pg16_first_4, a_offset0);
c1 = svld1_u16(pg16_first_4, a_offset1);
c2 = svld1_u16(pg16_first_4, a_offset2);
c3 = svld1_u16(pg16_first_4, a_offset3);
c4 = svld1_u16(pg16_first_4, a_offset4);
c5 = svld1_u16(pg16_first_4, a_offset5);
c6 = svld1_u16(pg16_first_4, a_offset6);
c7 = svld1_u16(pg16_first_4, a_offset7);

t0 = svzip1_u16(c0, c1);
t1 = svzip1_u16(c2, c3);
t2 = svzip1_u16(c4, c5);
t3 = svzip1_u16(c6, c7);

m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_0, svreinterpret_u64_u32(m00));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_1, svreinterpret_u64_u32(m01));

a_offset0 += 8 * lda;
a_offset1 += 8 * lda;
a_offset2 += 8 * lda;
a_offset3 += 8 * lda;
a_offset4 += 8 * lda;
a_offset5 += 8 * lda;
a_offset6 += 8 * lda;
a_offset7 += 8 * lda;

b_offset0 += 32;
}

if (rest) {
c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1
c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0));
c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0));
c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0));
c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0));
c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0));
c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0));
c7 = (svdup_u16(0));

t0 = svzip1_u16(c0, c1);
t1 = svzip1_u16(c2, c3);
t2 = svzip1_u16(c4, c5);
t3 = svzip1_u16(c6, c7);

m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1));
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3));

svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_0, svreinterpret_u64_u32(m00));
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0,
st_offsets_1, svreinterpret_u64_u32(m01));
}
}

if (n & 2) {
a_offset0 = a_offset;
a_offset1 = a_offset0 + lda;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset5 = a_offset4 + lda;
a_offset6 = a_offset5 + lda;
a_offset7 = a_offset6 + lda;
a_offset += 2;

b_offset0 = b_offset;
b_offset1 = b_offset0 + 8;

b_offset += 2 * pad_m;

for (BLASLONG i = 0; i < m / 8; i++) {
for (BLASLONG line = 0; line < 2; line++) {
b_offset0[line * 4] = a_offset0[line];
b_offset0[line * 4 + 1] = a_offset1[line];
b_offset0[line * 4 + 2] = a_offset2[line];
b_offset0[line * 4 + 3] = a_offset3[line];

b_offset1[line * 4] = a_offset4[line];
b_offset1[line * 4 + 1] = a_offset5[line];
b_offset1[line * 4 + 2] = a_offset6[line];
b_offset1[line * 4 + 3] = a_offset7[line];
}
b_offset0 += 16;
b_offset1 += 16;

a_offset0 += 8 * lda;
a_offset1 += 8 * lda;
a_offset2 += 8 * lda;
a_offset3 += 8 * lda;
a_offset4 += 8 * lda;
a_offset5 += 8 * lda;
a_offset6 += 8 * lda;
a_offset7 += 8 * lda;
}

if (rest) {
for (BLASLONG line = 0; line < 2; line++) {
b_offset0[line * 4] = a_offset0[line];
b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];

b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line];
b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line];
b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line];
b_offset1[line * 4 + 3] = 0;
}
}
}

if (n & 1) {
a_offset0 = a_offset;
a_offset1 = a_offset0 + lda;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset5 = a_offset4 + lda;
a_offset6 = a_offset5 + lda;
a_offset7 = a_offset6 + lda;

for (BLASLONG i = 0; i < m / 8; i++) {
b_offset[0] = a_offset0[0];
b_offset[1] = a_offset1[0];
b_offset[2] = a_offset2[0];
b_offset[3] = a_offset3[0];

b_offset[4] = 0;
b_offset[5] = 0;
b_offset[6] = 0;
b_offset[7] = 0;

b_offset[8] = a_offset4[0];
b_offset[9] = a_offset5[0];
b_offset[10] = a_offset6[0];
b_offset[11] = a_offset7[0];

b_offset[12] = 0;
b_offset[13] = 0;
b_offset[14] = 0;
b_offset[15] = 0;

b_offset += 16;
a_offset0 += 8 * lda;
a_offset1 += 8 * lda;
a_offset2 += 8 * lda;
a_offset3 += 8 * lda;
a_offset4 += 8 * lda;
a_offset5 += 8 * lda;
a_offset6 += 8 * lda;
a_offset7 += 8 * lda;
}

if (rest) {
b_offset[0] = *a_offset0;
b_offset[1] = rest == 1 ? 0 : *a_offset1;
b_offset[2] = rest <= 2 ? 0 : *a_offset2;
b_offset[3] = rest <= 3 ? 0 : *a_offset3;

b_offset[4] = 0;
b_offset[5] = 0;
b_offset[6] = 0;
b_offset[7] = 0;

b_offset[8] = rest <= 4 ? 0 : *a_offset4;
b_offset[9] = rest <= 5 ? 0 : *a_offset5;
b_offset[10] = rest <= 6 ? 0 : *a_offset6;
b_offset[11] = 0;

b_offset[12] = 0;
b_offset[13] = 0;
b_offset[14] = 0;
b_offset[15] = 0;
}
}

return 0;
}

+ 515
- 0
kernel/arm64/sbgemv_n_neon.c View File

@@ -0,0 +1,515 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <arm_neon.h>

/*
 * Scale the n contiguous floats starting at x by beta, in place.
 *
 * beta == 0 is special-cased: the range is overwritten with zero bytes
 * instead of being multiplied, so whatever x held before is discarded.
 * Otherwise the range is multiplied 16 floats at a time, then 4 at a time,
 * and the final 0..3 elements are handled with scalar code.
 */
static void beta_op(float *x, BLASLONG n, FLOAT beta) {
  if (beta == 0) {
    memset(x, 0, n * sizeof(float));
    return;
  }

  const BLASLONG tail = n & 15; /* elements left after the 16-wide blocks */
  BLASLONG k;

  /* Bulk: four 128-bit vectors (16 floats) per iteration. */
  for (k = n / 16; k > 0; k--) {
    vst1q_f32(x, vmulq_n_f32(vld1q_f32(x), beta));
    vst1q_f32(x + 4, vmulq_n_f32(vld1q_f32(x + 4), beta));
    vst1q_f32(x + 8, vmulq_n_f32(vld1q_f32(x + 8), beta));
    vst1q_f32(x + 12, vmulq_n_f32(vld1q_f32(x + 12), beta));
    x += 16;
  }

  /* Remaining whole vectors (at most three). */
  for (k = tail / 4; k > 0; k--) {
    vst1q_f32(x, vmulq_n_f32(vld1q_f32(x), beta));
    x += 4;
  }

  /* Last 0..3 scalars. */
  for (k = 0; k < (tail & 3); k++) {
    x[k] *= beta;
  }
}

/*
 * SBGEMV, non-transposed: y := alpha * A * x + beta * y
 *
 * A and x hold bfloat16 values, y holds fp32. A is walked column by column:
 * column k starts at a + k * lda and contributes m consecutive entries; x
 * supplies one entry per column (n in total) and y receives m entries.
 *
 * Unit-stride calls (incx == 1 && incy == 1) take the NEON path, which
 * consumes columns in groups of 8, then 4, 2 and 1. In that path, when
 * alpha != 1, x is multiplied by alpha in fp32 and converted back to bf16
 * before being used by the vector multiply-accumulates. Every other stride
 * combination uses the scalar loop at the end of the function.
 *
 * When beta == 0 the previous contents of y are discarded (never multiplied),
 * in both the unit-stride and the strided path.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda,
          bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) {
  BLASLONG i, j;
  bfloat16_t *a_ptr, *x_ptr;
  FLOAT *y_ptr;

  bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
  bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7;

  bfloat16x8_t x_vec;
  bfloat16x4_t x_vecx4;

  float32x4_t y1_vec, y2_vec;
  float32x4_t fp32_low, fp32_high;

  float x0, x1, x2, x3, x4, x5, x6, x7;
  bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6,
      *a_ptr7;

  a_ptr = (bfloat16_t *)a;
  x_ptr = (bfloat16_t *)x;

  /* Rows left over after the 8-row and 4-row vector steps. */
  BLASLONG rest_m = m & 3;

  bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0));
  bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0));

  if (incx == 1 && incy == 1) {
    /* Apply beta once up front; the column loops below only accumulate. */
    if (beta != 1) {
      beta_op(y, m, beta);
    }

    /* ---- 8 columns per pass ---- */
    for (i = 0; i < n / 8; i++) {
      a_ptr0 = a_ptr;
      a_ptr1 = a_ptr0 + lda;
      a_ptr2 = a_ptr1 + lda;
      a_ptr3 = a_ptr2 + lda;
      a_ptr4 = a_ptr3 + lda;
      a_ptr5 = a_ptr4 + lda;
      a_ptr6 = a_ptr5 + lda;
      a_ptr7 = a_ptr6 + lda;

      a_ptr += 8 * lda;

      y_ptr = y;

      x_vec = vld1q_bf16(x_ptr);

      if (alpha != 1) {
        /* Widen bf16 -> fp32 by interleaving a zero half-word below each
         * bf16 element, scale by alpha in fp32, then narrow back to bf16. */
        fp32_low = vreinterpretq_f32_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q),
                       vreinterpretq_u16_bf16(x_vec)));
        fp32_high = vreinterpretq_f32_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q),
                       vreinterpretq_u16_bf16(x_vec)));

        fp32_low = vmulq_n_f32(fp32_low, alpha);
        fp32_high = vmulq_n_f32(fp32_high, alpha);

        x_vec =
            vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high));
      }

      /* 8 rows x 8 columns per iteration. */
      for (j = 0; j < m / 8; j++) {
        a0 = vld1q_bf16(a_ptr0);
        a1 = vld1q_bf16(a_ptr1);
        a2 = vld1q_bf16(a_ptr2);
        a3 = vld1q_bf16(a_ptr3);
        a4 = vld1q_bf16(a_ptr4);
        a5 = vld1q_bf16(a_ptr5);
        a6 = vld1q_bf16(a_ptr6);
        a7 = vld1q_bf16(a_ptr7);

        y1_vec = vld1q_f32(y_ptr);
        y2_vec = vld1q_f32(y_ptr + 4);

        /* Interleave neighbouring columns: after zip1 the even lanes hold
         * rows 0..3 of the first column and the odd lanes rows 0..3 of the
         * second one; zip2 does the same for rows 4..7. */
        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t1 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3)));
        t2 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5)));
        t3 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7)));

        t4 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t5 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3)));
        t6 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5)));
        t7 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7)));

        /* "b" variants use the even bf16 lanes, "t" variants the odd ones,
         * each multiplied by the selected lane of x. Rows 0..3: */
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7);

        /* Rows 4..7: */
        y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0);
        y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1);
        y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2);
        y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3);
        y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4);
        y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5);
        y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6);
        y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7);

        vst1q_f32(y_ptr, y1_vec);
        vst1q_f32(y_ptr + 4, y2_vec);

        a_ptr0 += 8;
        a_ptr1 += 8;
        a_ptr2 += 8;
        a_ptr3 += 8;
        a_ptr4 += 8;
        a_ptr5 += 8;
        a_ptr6 += 8;
        a_ptr7 += 8;

        y_ptr += 8;
      }

      /* 4 rows x 8 columns: load half vectors and pad the top with zeros. */
      if (m & 4) {
        bfloat16x4_t a0x4 = vld1_bf16(a_ptr0);
        bfloat16x4_t a1x4 = vld1_bf16(a_ptr1);
        bfloat16x4_t a2x4 = vld1_bf16(a_ptr2);
        bfloat16x4_t a3x4 = vld1_bf16(a_ptr3);
        bfloat16x4_t a4x4 = vld1_bf16(a_ptr4);
        bfloat16x4_t a5x4 = vld1_bf16(a_ptr5);
        bfloat16x4_t a6x4 = vld1_bf16(a_ptr6);
        bfloat16x4_t a7x4 = vld1_bf16(a_ptr7);

        y1_vec = vld1q_f32(y_ptr);

        a0 = vcombine_bf16(a0x4, bf16_zero);
        a1 = vcombine_bf16(a1x4, bf16_zero);
        a2 = vcombine_bf16(a2x4, bf16_zero);
        a3 = vcombine_bf16(a3x4, bf16_zero);
        a4 = vcombine_bf16(a4x4, bf16_zero);
        a5 = vcombine_bf16(a5x4, bf16_zero);
        a6 = vcombine_bf16(a6x4, bf16_zero);
        a7 = vcombine_bf16(a7x4, bf16_zero);

        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t1 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3)));
        t2 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5)));
        t3 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7)));

        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5);
        y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6);
        y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7);

        vst1q_f32(y_ptr, y1_vec);

        a_ptr0 += 4;
        a_ptr1 += 4;
        a_ptr2 += 4;
        a_ptr3 += 4;
        a_ptr4 += 4;
        a_ptr5 += 4;
        a_ptr6 += 4;
        a_ptr7 += 4;

        y_ptr += 4;
      }

      /* Last 1..3 rows, scalar. Note: x is re-read from memory and scaled
       * by alpha in fp32 here, i.e. without the bf16 rounding that x_vec
       * went through above. */
      if (rest_m) {
        x0 = alpha * vcvtah_f32_bf16(x_ptr[0]);
        x1 = alpha * vcvtah_f32_bf16(x_ptr[1]);
        x2 = alpha * vcvtah_f32_bf16(x_ptr[2]);
        x3 = alpha * vcvtah_f32_bf16(x_ptr[3]);
        x4 = alpha * vcvtah_f32_bf16(x_ptr[4]);
        x5 = alpha * vcvtah_f32_bf16(x_ptr[5]);
        x6 = alpha * vcvtah_f32_bf16(x_ptr[6]);
        x7 = alpha * vcvtah_f32_bf16(x_ptr[7]);

        for (j = 0; j < rest_m; j++) {
          y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]);
          y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]);
          y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]);
          y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]);
          y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]);
          y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]);
          y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]);
          y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]);
        }
      }

      x_ptr += 8;
    }

    /* ---- 4 leftover columns ---- */
    if (n & 4) {
      a_ptr0 = a_ptr;
      a_ptr1 = a_ptr0 + lda;
      a_ptr2 = a_ptr1 + lda;
      a_ptr3 = a_ptr2 + lda;

      a_ptr += 4 * lda;

      x_vecx4 = vld1_bf16(x_ptr);
      if (alpha != 1) {
        fp32_low = vcvt_f32_bf16(x_vecx4);
        fp32_low = vmulq_n_f32(fp32_low, alpha);
        x_vecx4 = vcvt_bf16_f32(fp32_low);
      }

      y_ptr = y;
      for (j = 0; j < m / 8; j++) {
        a0 = vld1q_bf16(a_ptr0);
        a1 = vld1q_bf16(a_ptr1);
        a2 = vld1q_bf16(a_ptr2);
        a3 = vld1q_bf16(a_ptr3);

        y1_vec = vld1q_f32(y_ptr);
        y2_vec = vld1q_f32(y_ptr + 4);

        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t1 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3)));
        t4 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t5 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3)));

        y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1);
        y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3);

        y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0);
        y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1);
        y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2);
        y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3);

        vst1q_f32(y_ptr, y1_vec);
        vst1q_f32(y_ptr + 4, y2_vec);

        a_ptr0 += 8;
        a_ptr1 += 8;
        a_ptr2 += 8;
        a_ptr3 += 8;

        y_ptr += 8;
      }

      if (m & 4) {
        bfloat16x4_t a0x4 = vld1_bf16(a_ptr0);
        bfloat16x4_t a1x4 = vld1_bf16(a_ptr1);
        bfloat16x4_t a2x4 = vld1_bf16(a_ptr2);
        bfloat16x4_t a3x4 = vld1_bf16(a_ptr3);

        y1_vec = vld1q_f32(y_ptr);

        /* Pack columns 0/2 and 1/3 into full vectors so that zip1 yields
         * the interleaved pair (0,1) and zip2 the pair (2,3). */
        a0 = vcombine_bf16(a0x4, a2x4);
        a1 = vcombine_bf16(a1x4, a3x4);

        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t1 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));

        y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1);
        y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3);

        vst1q_f32(y_ptr, y1_vec);

        a_ptr0 += 4;
        a_ptr1 += 4;
        a_ptr2 += 4;
        a_ptr3 += 4;

        y_ptr += 4;
      }

      if (rest_m) {
        /* Scalar tail reuses the (already alpha-scaled) bf16 x values. */
        fp32_low = vcvt_f32_bf16(x_vecx4);

        x0 = vgetq_lane_f32(fp32_low, 0);
        x1 = vgetq_lane_f32(fp32_low, 1);
        x2 = vgetq_lane_f32(fp32_low, 2);
        x3 = vgetq_lane_f32(fp32_low, 3);

        for (j = 0; j < rest_m; j++) {
          y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]);
          y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]);
          y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]);
          y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]);
        }
      }

      x_ptr += 4;
    }

    /* ---- 2 leftover columns ---- */
    if (n & 2) {
      a_ptr0 = a_ptr;
      a_ptr1 = a_ptr0 + lda;

      a_ptr += 2 * lda;

      /* Build {x[0], x[1], x[0], x[1]} from scalar loads so that only the
       * two valid x entries are read. */
      x_vecx4 = vreinterpret_bf16_u16(vzip1_u16(
          vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])),
          vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1]))
      ));

      if (alpha != 1) {
        fp32_low = vcvt_f32_bf16(x_vecx4);
        fp32_low = vmulq_n_f32(fp32_low, alpha);
        x_vecx4 = vcvt_bf16_f32(fp32_low);
      }

      y_ptr = y;
      for (j = 0; j < m / 8; j++) {
        a0 = vld1q_bf16(a_ptr0);
        a1 = vld1q_bf16(a_ptr1);

        y1_vec = vld1q_f32(y_ptr);
        y2_vec = vld1q_f32(y_ptr + 4);

        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));
        t1 = vreinterpretq_bf16_u16(
            vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));

        y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1);

        y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0);
        y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1);

        vst1q_f32(y_ptr, y1_vec);
        vst1q_f32(y_ptr + 4, y2_vec);

        a_ptr0 += 8;
        a_ptr1 += 8;

        y_ptr += 8;
      }

      if (m & 4) {
        bfloat16x4_t a0x4 = vld1_bf16(a_ptr0);
        bfloat16x4_t a1x4 = vld1_bf16(a_ptr1);

        y1_vec = vld1q_f32(y_ptr);

        a0 = vcombine_bf16(a0x4, bf16_zero);
        a1 = vcombine_bf16(a1x4, bf16_zero);

        t0 = vreinterpretq_bf16_u16(
            vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1)));

        y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0);
        y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1);

        vst1q_f32(y_ptr, y1_vec);

        a_ptr0 += 4;
        a_ptr1 += 4;

        y_ptr += 4;
      }

      if (m & 2) {
        fp32_low = vcvt_f32_bf16(x_vecx4);
        x0 = vgetq_lane_f32(fp32_low, 0);
        x1 = vgetq_lane_f32(fp32_low, 1);

        y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]);
        y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]);
        y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]);
        y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]);

        a_ptr0 += 2;
        a_ptr1 += 2;

        y_ptr += 2;
      }

      if (m & 1) {
        fp32_low = vcvt_f32_bf16(x_vecx4);
        x0 = vgetq_lane_f32(fp32_low, 0);
        x1 = vgetq_lane_f32(fp32_low, 1);

        y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]);
        y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]);
      }

      x_ptr += 2;
    }

    /* ---- final single column, scalar ---- */
    if (n & 1) {
      x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha;
      y_ptr = y;
      a_ptr0 = a_ptr;

      for (j = 0; j < m; j++) {
        y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]);
      }
    }

    return (0);
  }

  /* ---- generic strided fallback ---- */
  BLASLONG iy = 0;
  if (beta == 0) {
    /* beta == 0 must overwrite y instead of multiplying it: 0 * NaN and
     * 0 * Inf are NaN, so stale or uninitialised contents of y would leak
     * into the result. The unit-stride path gets the same effect from the
     * memset in beta_op(). */
    for (i = 0; i < m; i++) {
      y[iy] = 0;
      iy += incy;
    }
  } else if (beta != 1) {
    for (i = 0; i < m; i++) {
      y[iy] *= beta;
      iy += incy;
    }
  }

  /* One column at a time: y += (alpha * x[j]) * A[:, j]. */
  for (j = 0; j < n; j++) {
    x0 = alpha * vcvtah_f32_bf16(*x_ptr);
    iy = 0;
    for (i = 0; i < m; i++) {
      y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]);
      iy += incy;
    }

    a_ptr += lda;
    x_ptr += incx;
  }

  return (0);
}

+ 202
- 0
kernel/arm64/sbgemv_t_bfdot.c View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_neon.h>
#include "common.h"

/*
 * SBGEMV, transposed: for every column j of A (n columns, lda apart, m
 * consecutive bf16 entries each)
 *
 *     y[j * incy] = alpha * dot(A[:, j], x) + beta * y[j * incy]
 *
 * with bf16 A and x and fp32 y. When beta == 0 the old value of y is not
 * read. Returns 0; returns immediately when m < 1 or n < 1.
 *
 * incx == 1 uses BFDOT-based vector code: the first 4 * (n / 4) columns are
 * processed as four streams of n / 4 consecutive columns each, the remaining
 * 0..3 columns one at a time. Any other incx uses a plain scalar loop.
 */
int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy)
{
    if (m < 1 || n < 1) return(0);

    bfloat16_t *col = (bfloat16_t*)(a);
    bfloat16_t *xs = (bfloat16_t*)(x);
    BLASLONG r, c;
    BLASLONG yoff = 0;

    /* Strided x: scalar dot product per column. */
    if (incx != 1) {
        for (c = 0; c < n; c++) {
            float dot = 0.0;
            BLASLONG xoff = 0;
            for (r = 0; r < m; r++) {
                dot += vcvtah_f32_bf16(col[r]) * vcvtah_f32_bf16(xs[xoff]);
                xoff += incx;
            }
            if (beta == 0.0f) {
                y[yoff] = alpha * dot;
            }
            else {
                y[yoff] = alpha * dot + beta * y[yoff];
            }
            yoff += incy;
            col += lda;
        }
        return (0);
    }

    /* Contiguous x: split the columns into four equally sized streams. */
    const BLASLONG quarter = n / 4;
    const BLASLONG a_gap = lda * quarter;   /* distance between stream starts in A */
    const BLASLONG y_gap = incy * quarter;  /* distance between stream starts in y */

    bfloat16_t *c0 = col;
    bfloat16_t *c1 = col + a_gap;
    bfloat16_t *c2 = col + a_gap * 2;
    bfloat16_t *c3 = col + a_gap * 3;

    float *o0 = y;
    float *o1 = y + y_gap;
    float *o2 = y + y_gap * 2;
    float *o3 = y + y_gap * 3;

    for (c = 0; c < quarter; c++) {
        float32x4_t acc0 = vdupq_n_f32(0.0f);
        float32x4_t acc1 = vdupq_n_f32(0.0f);
        float32x4_t acc2 = vdupq_n_f32(0.0f);
        float32x4_t acc3 = vdupq_n_f32(0.0f);

        /* 8 rows per step: one shared x load feeds all four columns. */
        for (r = 0; r + 8 <= m; r += 8) {
            bfloat16x8_t xq = vld1q_bf16(xs + r);

            bfloat16x8_t q0 = vld1q_bf16(c0 + r);
            bfloat16x8_t q1 = vld1q_bf16(c1 + r);
            bfloat16x8_t q2 = vld1q_bf16(c2 + r);
            bfloat16x8_t q3 = vld1q_bf16(c3 + r);

            acc0 = vbfdotq_f32(acc0, q0, xq);
            acc1 = vbfdotq_f32(acc1, q1, xq);
            acc2 = vbfdotq_f32(acc2, q2, xq);
            acc3 = vbfdotq_f32(acc3, q3, xq);
        }

        /* One optional 4-row step: a 64-bit BFDOT into a fresh accumulator,
         * then folded into the low half of the running sums. */
        if (m - r >= 4) {
            bfloat16x4_t xd = vld1_bf16(xs + r);

            bfloat16x4_t d0 = vld1_bf16(c0 + r);
            bfloat16x4_t d1 = vld1_bf16(c1 + r);
            bfloat16x4_t d2 = vld1_bf16(c2 + r);
            bfloat16x4_t d3 = vld1_bf16(c3 + r);

            float32x2_t p0 = vbfdot_f32(vdup_n_f32(0.0f), d0, xd);
            float32x2_t p1 = vbfdot_f32(vdup_n_f32(0.0f), d1, xd);
            float32x2_t p2 = vbfdot_f32(vdup_n_f32(0.0f), d2, xd);
            float32x2_t p3 = vbfdot_f32(vdup_n_f32(0.0f), d3, xd);

            acc0 = vcombine_f32(vadd_f32(p0, vget_low_f32(acc0)), vget_high_f32(acc0));
            acc1 = vcombine_f32(vadd_f32(p1, vget_low_f32(acc1)), vget_high_f32(acc1));
            acc2 = vcombine_f32(vadd_f32(p2, vget_low_f32(acc2)), vget_high_f32(acc2));
            acc3 = vcombine_f32(vadd_f32(p3, vget_low_f32(acc3)), vget_high_f32(acc3));

            r += 4;
        }

        /* Reduce the vector sums and combine with the old y (unless beta is 0). */
        if (beta == 0.0f) {
            o0[yoff] = alpha * vaddvq_f32(acc0);
            o1[yoff] = alpha * vaddvq_f32(acc1);
            o2[yoff] = alpha * vaddvq_f32(acc2);
            o3[yoff] = alpha * vaddvq_f32(acc3);
        }
        else {
            o0[yoff] = alpha * vaddvq_f32(acc0) + beta * o0[yoff];
            o1[yoff] = alpha * vaddvq_f32(acc1) + beta * o1[yoff];
            o2[yoff] = alpha * vaddvq_f32(acc2) + beta * o2[yoff];
            o3[yoff] = alpha * vaddvq_f32(acc3) + beta * o3[yoff];
        }

        /* Last 0..3 rows are added directly onto the stored results. */
        for (; r < m; ++r) {
            o0[yoff] += alpha * vcvtah_f32_bf16(c0[r]) * vcvtah_f32_bf16(xs[r]);
            o1[yoff] += alpha * vcvtah_f32_bf16(c1[r]) * vcvtah_f32_bf16(xs[r]);
            o2[yoff] += alpha * vcvtah_f32_bf16(c2[r]) * vcvtah_f32_bf16(xs[r]);
            o3[yoff] += alpha * vcvtah_f32_bf16(c3[r]) * vcvtah_f32_bf16(xs[r]);
        }

        yoff += incy;

        c0 += lda;
        c1 += lda;
        c2 += lda;
        c3 += lda;
    }

    /* The fourth stream's cursors now sit just past the last column handled
     * above; o3 indexed by the running yoff addresses the matching y entry.
     * Continue from there for the columns that did not fill a group of four. */
    bfloat16_t *tail_col = c3;
    float *tail_out = o3;
    for (c = quarter * 4; c < n; c++) {
        float32x4_t acc = vdupq_n_f32(0.0f);

        for (r = 0; r + 8 <= m; r += 8) {
            bfloat16x8_t xq = vld1q_bf16(xs + r);
            bfloat16x8_t q = vld1q_bf16(tail_col + r);
            acc = vbfdotq_f32(acc, q, xq);
        }

        if (m - r >= 4) {
            bfloat16x4_t xd = vld1_bf16(xs + r);
            bfloat16x4_t d = vld1_bf16(tail_col + r);

            float32x2_t p = vbfdot_f32(vdup_n_f32(0.0f), d, xd);
            acc = vcombine_f32(vadd_f32(p, vget_low_f32(acc)), vget_high_f32(acc));

            r += 4;
        }

        if (beta == 0.0f) {
            tail_out[yoff] = alpha * vaddvq_f32(acc);
        }
        else {
            tail_out[yoff] = alpha * vaddvq_f32(acc) + beta * tail_out[yoff];
        }

        for (; r < m; ++r) {
            tail_out[yoff] += alpha * vcvtah_f32_bf16(tail_col[r]) * vcvtah_f32_bf16(xs[r]);
        }

        yoff += incy;

        tail_col += lda;
    }
    return(0);
}

+ 80
- 0
kernel/arm64/sgemm_direct_arm64_sme1.c View File

@@ -0,0 +1,80 @@
/*
Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause-Clear
*/
#include "common.h"
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#if defined(HAVE_SME)
/* Function prototypes */
/*
 * Both routines are defined outside this file; the __asm__ labels pin the
 * exact symbol names they are linked against.
 *
 * sgemm_direct_sme1_preprocess: called by CNAME below as (M, K, A, A_mod),
 *   i.e. it reads `a` and fills the scratch buffer `a_mod`.
 * sgemm_direct_sme1_2VLx2VL: called by CNAME below as (M, K, N, A_mod, B, R).
 *   NOTE(review): matResult is declared `const` although the caller passes
 *   the output matrix R here - confirm against the assembly implementation.
 */
extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\
const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess");
extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\
const float * matLeft,\
const float * restrict matRight,\
const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL");
/* Function Definitions */
/*
 * Number of 32-bit elements per vector, derived from RDSVL.
 *
 * "rdsvl Xd, #1" yields the streaming vector length in bytes (Arm SME ISA);
 * dividing that by 4 gives the element count for single precision.
 */
uint64_t sve_cntw() {
    uint64_t svl_bytes;

    asm volatile("rdsvl %[res], #1\n" : [res] "=r" (svl_bytes));

    return svl_bytes >> 2;
}
/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\
float * __restrict A, BLASLONG strideA, float * __restrict B,\
BLASLONG strideB , float * __restrict R, BLASLONG strideR)
*/
/*
 * Direct SGEMM through the SME assembly kernels (comment in the original:
 * "Calculate C = A*B", with the product stored to R).
 *
 * A is first repacked by sgemm_direct_sme1_preprocess() into a temporary
 * heap buffer whose row count is M rounded up to a multiple of the vector
 * length in 32-bit elements; sgemm_direct_sme1_2VLx2VL() then consumes that
 * buffer together with B and writes R.
 *
 * strideA, strideB and strideR are accepted for interface compatibility but
 * are not used by this implementation.
 *
 * The function has no way to report an error, so a failed allocation of the
 * temporary buffer prints a diagnostic to stderr and aborts instead of
 * handing a NULL pointer to the assembly kernels.
 */
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
            BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
            float * __restrict R, BLASLONG strideR){
    uint64_t vl_elms = sve_cntw();

    /* Round M up to the next multiple of vl_elms with integer arithmetic
     * (no detour through double precision). */
    uint64_t m_mod = ((uint64_t)M + vl_elms - 1) / vl_elms * vl_elms;

    size_t a_mod_bytes = m_mod * K * sizeof(float);
    float *A_mod = malloc(a_mod_bytes);

    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * that comes back NULL counts as a failure. */
    if (A_mod == NULL && a_mod_bytes != 0) {
        fprintf(stderr,
                "OpenBLAS: sgemm_direct (SME): failed to allocate %zu bytes "
                "for the packed A buffer\n", a_mod_bytes);
        abort();
    }

    /* Prevent compiler optimization by reading from memory instead
     * of reading directly from vector (z) registers: the empty asm declares
     * every predicate and vector register as clobbered.
     * */
    asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
                         "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
                         "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
                         "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
                         "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
                         "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");

    /* Pre-process the left matrix to make it suitable for
       matrix sum of outer-product calculation
     */
    sgemm_direct_sme1_preprocess(M, K, A, A_mod);

    /* Calculate C = A*B */
    sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);

    /* Same register barrier after the kernels have run. */
    asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
                         "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
                         "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
                         "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
                         "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
                         "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");

    free(A_mod);
}
#else
/* Built without HAVE_SME: same signature, but an empty stub that performs no
 * computation and leaves R untouched. */
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
float * __restrict R, BLASLONG strideR){}
#endif

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save