Browse Source

Merge branch 'OpenMathLib:develop' into gemmt_tests

pull/5187/head
Martin Kroeker GitHub 1 year ago
parent
commit
77b2f15c45
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
100 changed files with 3572 additions and 3113 deletions
  1. +4
    -4
      .cirrus.yml
  2. +1
    -0
      .github/workflows/apple_m.yml
  3. +6
    -6
      .github/workflows/c910v.yml
  4. +93
    -27
      CMakeLists.txt
  5. +7
    -1
      CONTRIBUTORS.md
  6. +6
    -1
      Makefile
  7. +5
    -0
      Makefile.system
  8. +1
    -1
      azure-pipelines.yml
  9. +0
    -11
      cmake/arch.cmake
  10. +12
    -12
      cmake/fc.cmake
  11. +27
    -6
      cmake/prebuild.cmake
  12. +14
    -9
      cmake/system.cmake
  13. +0
    -2
      common_param.h
  14. +9
    -2
      common_zarch.h
  15. +3
    -0
      cpuid.S
  16. +13
    -2
      cpuid_arm64.c
  17. +17
    -0
      cpuid_x86.c
  18. +0
    -16
      ctest/CMakeLists.txt
  19. +4
    -4
      ctest/Makefile
  20. +1
    -1
      ctest/c_cblat1c.c
  21. +4
    -0
      driver/level2/CMakeLists.txt
  22. +4
    -0
      driver/level3/CMakeLists.txt
  23. +4
    -0
      driver/others/CMakeLists.txt
  24. +31
    -10
      driver/others/dynamic_arm64.c
  25. +154
    -29
      exports/gensymbol.pl
  26. +8
    -4
      interface/CMakeLists.txt
  27. +18
    -7
      interface/gemv.c
  28. +6
    -1
      kernel/CMakeLists.txt
  29. +4
    -3
      kernel/Makefile.L3
  30. +2
    -2
      kernel/arm64/KERNEL.A64FX
  31. +9
    -4
      kernel/arm64/KERNEL.ARMV8SVE
  32. +5
    -1
      kernel/arm64/KERNEL.NEOVERSEN1
  33. +4
    -4
      kernel/arm64/KERNEL.NEOVERSEN2
  34. +2
    -0
      kernel/arm64/KERNEL.NEOVERSEV1
  35. +50
    -3
      kernel/arm64/dot.c
  36. +138
    -0
      kernel/arm64/gemv_n_sve_v1x3.c
  37. +207
    -0
      kernel/arm64/gemv_n_sve_v4x3.c
  38. +24
    -3
      kernel/arm64/sgemm_direct_arm64_sme1.c
  39. +219
    -0
      kernel/arm64/sgemv_n_neon.c
  40. +113
    -0
      kernel/arm64/symv_L_asimd_4x4.c
  41. +103
    -0
      kernel/arm64/symv_L_sve_v1x4.c
  42. +106
    -0
      kernel/arm64/symv_U_asimd_4x4.c
  43. +104
    -0
      kernel/arm64/symv_U_sve_v1x4.c
  44. +120
    -0
      kernel/arm64/symv_microk_asimd_4x4.c
  45. +89
    -0
      kernel/arm64/symv_microk_sve_v1x4.c
  46. +1
    -6
      kernel/loongarch64/amax_lasx.S
  47. +10
    -12
      kernel/loongarch64/asum_lasx.S
  48. +1
    -1
      kernel/loongarch64/cdot_lasx.S
  49. +53
    -25
      kernel/loongarch64/cnrm2_lasx.S
  50. +4
    -4
      kernel/loongarch64/copy_lasx.S
  51. +61
    -186
      kernel/loongarch64/cscal_lasx.S
  52. +29
    -57
      kernel/loongarch64/dot_lasx.S
  53. +282
    -284
      kernel/loongarch64/iamax_lasx.S
  54. +165
    -247
      kernel/loongarch64/icamax_lasx.S
  55. +123
    -1177
      kernel/loongarch64/rot_lasx.S
  56. +94
    -32
      kernel/loongarch64/snrm2_lasx.S
  57. +9
    -56
      kernel/loongarch64/swap_lasx.S
  58. +336
    -421
      kernel/power/sgemv_n.c
  59. +128
    -172
      kernel/power/sgemv_t.c
  60. +207
    -97
      kernel/riscv64/gemv_n_vector.c
  61. +221
    -74
      kernel/riscv64/zgemv_n_vector.c
  62. +0
    -2
      kernel/setparam-ref.c
  63. +1
    -1
      kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
  64. +1
    -1
      kernel/x86_64/zsum.c
  65. +2
    -0
      kernel/zarch/ctrmm4x4V.S
  66. +2
    -0
      kernel/zarch/gemm8x4V.S
  67. +2
    -0
      kernel/zarch/strmm8x4V.S
  68. +2
    -0
      kernel/zarch/trmm8x4V.S
  69. +2
    -0
      kernel/zarch/ztrmm4x4V.S
  70. +10
    -0
      lapack-netlib/LAPACKE/include/lapacke_config.h
  71. +2
    -2
      lapack-netlib/SRC/cgees.c
  72. +2
    -2
      lapack-netlib/SRC/cgeesx.c
  73. +3
    -3
      lapack-netlib/SRC/cgeev.f
  74. +2
    -2
      lapack-netlib/SRC/cgges.c
  75. +2
    -2
      lapack-netlib/SRC/cgges3.c
  76. +2
    -2
      lapack-netlib/SRC/cggesx.c
  77. +1
    -1
      lapack-netlib/SRC/dgees.c
  78. +1
    -1
      lapack-netlib/SRC/dgeesx.c
  79. +3
    -3
      lapack-netlib/SRC/dgeev.f
  80. +2
    -2
      lapack-netlib/SRC/dgges.c
  81. +2
    -2
      lapack-netlib/SRC/dgges3.c
  82. +2
    -2
      lapack-netlib/SRC/dggesx.c
  83. +1
    -1
      lapack-netlib/SRC/sgees.c
  84. +1
    -1
      lapack-netlib/SRC/sgeesx.c
  85. +3
    -3
      lapack-netlib/SRC/sgeev.f
  86. +2
    -2
      lapack-netlib/SRC/sgges.c
  87. +2
    -2
      lapack-netlib/SRC/sgges3.c
  88. +2
    -2
      lapack-netlib/SRC/sggesx.c
  89. +2
    -2
      lapack-netlib/SRC/zgees.c
  90. +2
    -2
      lapack-netlib/SRC/zgeesx.c
  91. +3
    -3
      lapack-netlib/SRC/zgeev.f
  92. +3
    -2
      lapack-netlib/SRC/zgges.c
  93. +3
    -2
      lapack-netlib/SRC/zgges3.c
  94. +3
    -2
      lapack-netlib/SRC/zggesx.c
  95. +0
    -6
      lapack-netlib/TESTING/EIG/CMakeLists.txt
  96. +6
    -6
      lapack-netlib/TESTING/EIG/cerred.f
  97. +6
    -6
      lapack-netlib/TESTING/EIG/derred.f
  98. +6
    -6
      lapack-netlib/TESTING/EIG/serred.f
  99. +6
    -6
      lapack-netlib/TESTING/EIG/zerred.f
  100. +0
    -4
      lapack-netlib/TESTING/LIN/CMakeLists.txt

+ 4
- 4
.cirrus.yml View File

@@ -58,8 +58,8 @@ task:
- export VALID_ARCHS="i386 x86_64"
- xcrun --sdk macosx --show-sdk-path
- xcodebuild -version
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64"
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64"
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:
@@ -78,8 +78,8 @@ task:
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0"
- xcrun --sdk iphoneos --show-sdk-path
- ls -l /Applications
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1


+ 1
- 0
.github/workflows/apple_m.yml View File

@@ -102,6 +102,7 @@ jobs:
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DUSE_OPENMP=${{matrix.openmp}} \
-DOpenMP_Fortran_LIB_NAMES=omp \
-DINTERFACE64=${{matrix.ilp64}} \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \


+ 6
- 6
.github/workflows/c910v.yml View File

@@ -31,7 +31,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: install build deps
run: |
@@ -40,18 +40,18 @@ jobs:
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev

- name: checkout qemu
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
repository: T-head-Semi/qemu
repository: XUANTIE-RV/qemu
path: qemu
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0

- name: build qemu
run: |
# Force use c910v qemu-user
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch
cd qemu
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
make -j$(nproc)


+ 93
- 27
CMakeLists.txt View File

@@ -77,6 +77,17 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in

set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )

if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL ""))
set (DELETE_STATIC_LIBS "")
if (NOT BUILD_STATIC_LIBS)
message (STATUS "forcing build of a temporary static library for symbol renaming")
set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE)
set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib))
endif ()
endif()


#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
@@ -109,10 +120,6 @@ endif()

message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")

if (USE_OPENMP)
find_package(OpenMP REQUIRED)
endif ()

include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

@@ -230,6 +237,12 @@ endif ()
# add objects to the openblas lib
if(NOT NO_LAPACK)
add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES})
if (USE_OPENMP AND (NOT NOFORTRAN))
# Disable OpenMP for LAPACK Fortran codes on Windows.
if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran)
endif()
endif()
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>")
endif()
if(NOT NO_LAPACKE)
@@ -271,30 +284,59 @@ endif()

if (USE_OPENMP)
if(BUILD_STATIC_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
if(NOFORTRAN)
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
else()
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran)
endif()
endif()
if(BUILD_SHARED_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
if(NOFORTRAN)
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
else()
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran)
endif()
endif()
endif()

# Seems that this hack doesn't required since macOS 11 Big Sur
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
if (NOT NOFORTRAN)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
else ()
set (CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
endif ()
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
# Use response files
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
# Always build static library first
if(BUILD_STATIC_LIBS)
set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a")
else()
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a")
endif()
set(CREATE_STATIC_LIBRARY_COMMAND
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' "
"sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ")
if(BUILD_SHARED_LIBS)
add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static)
set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib")
endif()
if(USE_OPENMP)
get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES)
else()
set(OMP_LIB "")
endif()
if(NOT NOFORTRAN)
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
if(BUILD_SHARED_LIBS)
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'")
endif()
else()
set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
if(BUILD_SHARED_LIBS)
set(CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'")
endif()
endif()
endif()

# Handle MSVC exports
@@ -379,7 +421,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
endif()
endif()

if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
@@ -467,10 +509,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
else ()
set (BZ 0)
endif()

if (CMAKE_SYSTEM_NAME MATCHES "Windows")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if (CMAKE_BUILD_TYPE MATCHES "Debug")
set (CRTLIB msvcrtd)
set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb)
set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
else ()
set (CRTLIB msvcrt)
set (PDBOPT "")
endif()
#if (USE_PERL)
message(STATUS "adding postbuild instruction to rename syms")
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c
COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT}
#COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a
${REMOVE_STATIC_LIB} VERBATIM
)
#endif ()
else ()
if (NOT USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
else()
@@ -481,6 +546,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
)
endif()
endif()
endif()

if (BUILD_BENCHMARKS)
#find_package(OpenMP REQUIRED)
@@ -650,4 +716,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

+ 7
- 1
CONTRIBUTORS.md View File

@@ -26,6 +26,9 @@
* Chris Sidebottom <chris.sidebottom@arm.com>
* Optimizations and other improvements targeting AArch64

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* Optimizations and other improvements targeting AArch64

## Previous Developers

* Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -247,4 +250,7 @@ In chronological order:

* Ye Tao <ye.tao@arm.com>
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
* [2025-02-27] Add sbgemv_n_neon kernel
* [2025-02-27] Add sbgemv_n_neon kernel

* Abhishek Kumar <https://github.com/abhishek-iitmadras>
* [2025-04-22] Optimise dot kernel for NEOVERSE V1

+ 6
- 1
Makefile View File

@@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
fi
endif

ifeq ($(OSNAME), WINNT)
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif

ifneq ($(OSNAME), AIX)
@echo -n " Library Name ... $(LIBNAME)"
else
@@ -447,7 +452,7 @@ endif
@rm -f cblas.tmp cblas.tmp2
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)


+ 5
- 0
Makefile.system View File

@@ -435,6 +435,11 @@ ifeq (x$(XCVER), x 15)
CCOMMON_OPT += -Wl,-ld_classic
FCOMMON_OPT += -Wl,-ld_classic
endif
ifeq (x$(XCVER), x 16)
ifeq ($(F_COMPILER), GFORTRAN)
override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB)))
endif
endif
endif

ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))


+ 1
- 1
azure-pipelines.yml View File

@@ -175,7 +175,7 @@ jobs:
- script: |
brew update
brew install llvm flang
make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1
make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1

- job: OSX_OpenMP_Clang
pool:


+ 0
- 11
cmake/arch.cmake View File

@@ -31,17 +31,6 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()

if (USE_OPENMP)
# USE_SIMPLE_THREADED_LEVEL3 = 1
# NO_AFFINITY = 1
find_package(OpenMP REQUIRED)
if (OpenMP_FOUND)
set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP")
set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}")
endif()
endif ()


if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)


+ 12
- 12
cmake/fc.cmake View File

@@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee")
endif ()
@@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif ()

if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

if (${F_COMPILER} STREQUAL "FUJITSU")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM")
set(FCOMMON_OPT "${FCOMMON_OPT} -q32")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE")
endif ()

if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64")

if (USE_OPENMP)
set(FEXTRALIB "${FEXTRALIB} -lstdc++")
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN")
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel")
set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

if (${F_COMPILER} STREQUAL "COMPAQ")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY")
if (NOT USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()

@@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR")
# -w=unused: Suppress warning messages about unused variables
set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags")
endif ()
endif ()



+ 27
- 6
cmake/prebuild.cmake View File

@@ -1006,15 +1006,15 @@ endif ()
"#define HAVE_SVE\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME")
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
@@ -1249,6 +1249,25 @@ endif ()
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "P5600")
file(APPEND ${TARGET_CONF_TEMP}
"#define L2_SIZE 1048576\n"
@@ -1409,9 +1428,11 @@ endif ()
# GetArch_2nd
foreach(float_char S;D;Q;C;Z;X)
if (NOT DEFINED ${float_char}GEMM_UNROLL_M)
message(STATUS "setting unrollm=2")
set(${float_char}GEMM_UNROLL_M 2)
endif()
if (NOT DEFINED ${float_char}GEMM_UNROLL_N)
message(STATUS "setting unrolln=2")
set(${float_char}GEMM_UNROLL_N 2)
endif()
endforeach()


+ 14
- 9
cmake/system.cmake View File

@@ -372,6 +372,20 @@ else ()
endif ()
endif ()

if (USE_OPENMP)
find_package(OpenMP COMPONENTS C REQUIRED)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP")
if (NOT NOFORTRAN)
find_package(OpenMP COMPONENTS Fortran REQUIRED)
# Avoid mixed OpenMP linkage
get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES)
get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES)
if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB)
message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.")
endif()
endif ()
endif ()

if (BINARY64)
if (INTERFACE64)
# CCOMMON_OPT += -DUSE64BITINT
@@ -655,15 +669,6 @@ if (LAPACK_STRLEN)
endif()
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")

#Disable -fopenmp for LAPACK Fortran codes on Windows.
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel")
foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach ()
endif ()

if (CMAKE_Fortran_COMPILER)
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")


+ 0
- 2
common_param.h View File

@@ -224,10 +224,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
#endif
#ifdef ARCH_ARM64
#ifdef HAVE_SME
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
#endif
#endif

int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);


+ 9
- 2
common_zarch.h View File

@@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
.global REALNAME ;\
.type REALNAME, %function ;\
REALNAME:

#define EPILOGUE
#if defined(__ELF__) && defined(__linux__)
# define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
# define GNUSTACK
#endif

#define EPILOGUE \
.size REALNAME, .-REALNAME; \
GNUSTACK

#define PROFCODE



+ 3
- 0
cpuid.S View File

@@ -65,3 +65,6 @@ _cpuid:
.subsections_via_symbols

#endif
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif

+ 13
- 2
cpuid_arm64.c View File

@@ -374,15 +374,20 @@ int detect(void)
}
#else
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
cpulowperf=value64;
length64 = sizeof(value64);
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
if (value64 > 1) {
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
length64 = sizeof(value64);
sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0);
cpuhiperf=value64;
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
length64 = sizeof(value64);
sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0);
cpulowperf=value64;
}
length64 = sizeof(value64);
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
@@ -467,6 +472,7 @@ int n=0;
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
#endif
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value);
if (cpulowperf >0)
@@ -698,12 +704,17 @@ void get_cpuconfig(void)
case CPU_VORTEX:
printf("#define VORTEX \n");
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
printf("#define L1_DATA_LINESIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif


+ 17
- 0
cpuid_x86.c View File

@@ -1578,6 +1578,7 @@ int get_cpuname(void){
case 12: //family 6 exmodel 12
switch (model) {
case 15:
case 6: // Arrow Lake
if(support_avx512())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx2())
@@ -2421,6 +2422,22 @@ int get_coretype(void){
else
return CORE_NEHALEM;
}
case 12:
switch (model) {
case 6: // Arrow Lake
if(support_amx_bf16())
return CORE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CORE_COOPERLAKE;
if(support_avx512())
return CORE_SKYLAKEX;
if(support_avx2())
return CORE_HASWELL;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
}
case 15:
if (model <= 0x2) return CORE_NORTHWOOD;


+ 0
- 16
ctest/CMakeLists.txt View File

@@ -44,10 +44,6 @@ else()
c_${float_char}blas1.c)
endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat1 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m)
endif()
@@ -73,10 +69,6 @@ else()
constant.c)
endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat2 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m)
endif()
@@ -124,20 +116,12 @@ else()
endif()
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m)
endif()
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3_3m m)
endif()


+ 4
- 4
ctest/Makefile View File

@@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
EXTRALIB += -lomp
endif
endif
ifeq ($(F_COMPILER), NAG)
CEXTRALIB = -lgomp
EXTRALIB = -lgomp
endif
ifeq ($(F_COMPILER), IBM)
ifeq ($(C_COMPILER), GCC)
CEXTRALIB += -lgomp
EXTRALIB += -lgomp
endif
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
EXTRALIB += -lomp
endif
endif
endif


+ 1
- 1
ctest/c_cblat1c.c View File

@@ -440,7 +440,7 @@ static real c_b43 = (float)1.;
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
static complex mwpcs[5], mwpct[5];
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*);
static complex cx[8];
extern real scnrm2test_(integer*, complex*, integer*);
static integer np1;


+ 4
- 0
driver/level2/CMakeLists.txt View File

@@ -223,3 +223,7 @@ if (USE_THREAD)
endif ()

add_library(driver_level2 OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(driver_level2 OpenMP::OpenMP_C)
endif()

+ 4
- 0
driver/level3/CMakeLists.txt View File

@@ -171,3 +171,7 @@ endforeach ()
#

add_library(driver_level3 OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(driver_level3 OpenMP::OpenMP_C)
endif()

+ 4
- 0
driver/others/CMakeLists.txt View File

@@ -88,3 +88,7 @@ endif ()
#endif

add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES})

if (USE_OPENMP)
target_link_libraries(driver_others OpenMP::OpenMP_C)
endif()

+ 31
- 10
driver/others/dynamic_arm64.c View File

@@ -43,6 +43,14 @@
#include <sys/auxv.h>
#endif

#ifdef __APPLE__
#include <sys/sysctl.h>
int32_t value;
size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif

extern gotoblas_t gotoblas_ARMV8;
#ifdef DYNAMIC_LIST
#ifdef DYN_CORTEXA53
@@ -120,7 +128,7 @@ extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
#ifdef DYN_CORTEXA55
extern gotoblas_t gotoblas_CORTEXA55;
#else
#define gotoblas_CORTEXA55 gotoblas_ARMV8
@@ -147,17 +155,17 @@ extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
extern gotoblas_t gotoblas_A64FX;
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#endif
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#define gotoblas_A64FX gotoblas_ARMV8
#endif

#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
@@ -168,7 +176,7 @@ extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

#define NUM_CORETYPES 18
#define NUM_CORETYPES 19

/*
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -207,6 +215,7 @@ static char *corename[] = {
"cortexa55",
"armv8sve",
"a64fx",
"armv9sme",
"unknown"
};

@@ -229,6 +238,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
if (gotoblas == &gotoblas_A64FX) return corename[17];
if (gotoblas == &gotoblas_ARMV9SME) return corename[18];
return corename[NUM_CORETYPES];
}

@@ -266,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 15: return (&gotoblas_CORTEXA55);
case 16: return (&gotoblas_ARMV8SVE);
case 17: return (&gotoblas_A64FX);
case 18: return (&gotoblas_ARMV9SME);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -277,6 +288,11 @@ static gotoblas_t *get_coretype(void) {
char coremsg[128];

#if defined (OS_DARWIN)
//future #if !defined(NO_SME)
// if (support_sme1()) {
// return &gotoblas_ARMV9SME;
// }
// #endif
return &gotoblas_NEOVERSEN1;
#endif
@@ -439,6 +455,7 @@ static gotoblas_t *get_coretype(void) {
}
break;
case 0x61: // Apple
//future if (support_sme1()) return &gotoblas_ARMV9SME;
return &gotoblas_NEOVERSEN1;
break;
default:
@@ -446,8 +463,8 @@ static gotoblas_t *get_coretype(void) {
openblas_warning(1, coremsg);
}

#if !defined(NO_SME) && defined(HWCAP2_SME)
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
#if !defined(NO_SME)
if (support_sme1()) {
return &gotoblas_ARMV9SME;
}
#endif
@@ -511,6 +528,10 @@ int support_sme1(void) {
if(getauxval(AT_HWCAP2) & HWCAP2_SME){
ret = 1;
}
#endif
#if defined(__APPLE__)
sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0);
ret = value64;
#endif
return ret;
}

+ 154
- 29
exports/gensymbol.pl View File

@@ -21,7 +21,7 @@
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr);
@blasobjsd = (
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
@@ -29,7 +29,7 @@
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr);
@blasobjss = (
isamax,isamin,ismax,ismin,
@@ -38,7 +38,7 @@
smax,smin,snrm2,simatcopy,somatcopy,
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr);
@blasobjsz = (
izamax,izamin,,
@@ -48,28 +48,29 @@
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
zgeadd, dzsum, zgemmt);
zgeadd, dzsum, zgemmt,zgemmtr);

@blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby,
cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd,
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
cblas_scnrm2, cblas_scasum,
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
cblas_cgemmt);
cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr,
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy,
cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch);

@cblasobjsd = (
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot,
cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2,
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
cblas_dgemmt);
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr,
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy,
cblas_damax, cblas_damin, cblas_dgemm_batch);
@cblasobjss = (
cblas_sasum, cblas_saxpy, cblas_saxpby,
@@ -78,9 +79,10 @@
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk,
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
cblas_strsv, cblas_sgeadd,
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
cblas_sgemmt);
cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr,
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy,
cblas_samax, cblas_samin, cblas_sgemm_batch);

@cblasobjsz = (
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
@@ -88,13 +90,13 @@
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
cblas_zaxpby, cblas_zgeadd,
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
cblas_zgemmt);
cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr,
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy,
cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch);

@cblasobjs = ( cblas_xerbla );

@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch);

@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -709,6 +711,7 @@ zpotri,
# functions added for lapack-3.7.0
@lapackobjs2s = (@lapackobjs2s,
slarfy,
ssyconvf,
strevc3,
sgelqt,
sgelqt3,
@@ -832,12 +835,82 @@ zpotri,
zungtsqr_row
);

#functions added for lapack-3.11
@lapackobjs2c = (@lapackobjs2c,
cgedmd,
cgedmdq
);
@lapackobjs2d = (@lapackobjs2d,
dgedmd,
dgedmdq
);
@lapackobjs2s = (@lapackobjs2s,
sgedmd,
sgedmdq
);
@lapackobjs2z = (@lapackobjs2z,
zgedmd,
zgedmdq
);

#functions added post 3.11

@lapackobjs2c = (@lapackobjs2c,
cgelst,
cgeqp3rk,
claqp2rk,
claqp3rk,
clatrs3,
crscl,
ctrsyl3
);
# claqz0
# claqz1
# claqz2
# claqz3
# clatrs3

@lapackobjs2d = (@lapackobjs2d,
dgelst,
dgeqp3rk,
dlaqp2rk,
dlaqp3rk,
dlarmm,
dlatrs3,
dtrsyl3
);

@lapackobjs2s = (@lapackobjs2s,
sgelst,
sgeqp3rk,
slaqp2rk,
slaqp3rk,
slarmm,
slatrs3,
strsyl3
);

@lapackobjs2z = (@lapackobjs2z,
zgelst,
zgeqp3rk,
zlaqp2rk,
zlaqp3rk,
zlatrs3,
zrscl,
ztrsyl3
);
# zlaqz0
# zlaqz1
# zlaqz2
# zlaqz3

@lapack_extendedprecision_objs = (
zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx,
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
);

@lapack_deprecated_objsc = (
cgelqs, cgeqrs,
cgegs, cggsvd,
cgegv, cggsvp,
cgelsx, clahrd,
@@ -845,13 +918,16 @@ zpotri,
ctzrqf,
);
@lapack_deprecated_objsd = (
dgelqs, dgeqrs,
dgegs, dgeqpf,
dgegv, dggsvd,
dgelsx, dggsvp,
dlahrd,
dlatzm, dtzrqf);
@lapack_deprecated_objss = (
@lapack_deprecated_objss = (
sgelqs,
sgeqrs,
sgelsx,
sgegs,
sgegv,
@@ -864,6 +940,8 @@ zpotri,
);

@lapack_deprecated_objsz = (
zgelqs,
zgeqrs,
zgegs,
zgegv,
zgelsx,
@@ -997,6 +1075,10 @@ zpotri,
LAPACKE_cgebrd_work,
LAPACKE_cgecon,
LAPACKE_cgecon_work,
LAPACKE_cgedmd,
LAPACKE_cgedmd_work,
LAPACKE_cgedmdq,
LAPACKE_cgedmdq_work,
LAPACKE_cgeequ,
LAPACKE_cgeequ_work,
LAPACKE_cgeequb,
@@ -1584,8 +1666,15 @@ zpotri,
LAPACKE_cgetsqrhrt,
LAPACKE_cgetsqrhrt_work,
LAPACKE_cungtsqr_row,
LAPACKE_cungtsqr_row_work

LAPACKE_cungtsqr_row_work,
LAPACKE_clangb,
LAPACKE_clangb_work,
LAPACKE_ctrsyl3,
LAPACKE_ctrsyl3_work,
LAPACKE_ctz_nancheck,
LAPACKE_ctz_trans,
LAPACKE_cunhr_col,
LAPACKE_cunhr_col_work
);
@lapackeobjsd = (
LAPACKE_dgb_nancheck,
@@ -1656,6 +1745,10 @@ zpotri,
LAPACKE_dgebrd_work,
LAPACKE_dgecon,
LAPACKE_dgecon_work,
LAPACKE_dgedmd,
LAPACKE_dgedmd_work,
LAPACKE_dgedmdq,
LAPACKE_dgedmdq_work,
LAPACKE_dgeequ,
LAPACKE_dgeequ_work,
LAPACKE_dgeequb,
@@ -2197,7 +2290,15 @@ zpotri,
LAPACKE_dgetsqrhrt,
LAPACKE_dgetsqrhrt_work,
LAPACKE_dorgtsqr_row,
LAPACKE_dorgtsqr_row_work
LAPACKE_dorgtsqr_row_work,
LAPACKE_dlangb,
LAPACKE_dlangb_work,
LAPACKE_dorhr_col,
LAPACKE_dorhr_col_work,
LAPACKE_dtrsyl3,
LAPACKE_dtrsyl3_work,
LAPACKE_dtz_nancheck,
LAPACKE_dtz_trans,

);
@lapackeobjss = (
@@ -2269,6 +2370,10 @@ zpotri,
LAPACKE_sgebrd_work,
LAPACKE_sgecon,
LAPACKE_sgecon_work,
LAPACKE_sgedmd,
LAPACKE_sgedmd_work,
LAPACKE_sgedmdq,
LAPACKE_sgedmdq_work,
LAPACKE_sgeequ,
LAPACKE_sgeequ_work,
LAPACKE_sgeequb,
@@ -2802,7 +2907,15 @@ zpotri,
LAPACKE_sgetsqrhrt,
LAPACKE_sgetsqrhrt_work,
LAPACKE_sorgtsqr_row,
LAPACKE_sorgtsqr_row_work
LAPACKE_sorgtsqr_row_work,
LAPACKE_slangb,
LAPACKE_slangb_work,
LAPACKE_sorhr_col,
LAPACKE_sorhr_col_work,
LAPACKE_strsyl3,
LAPACKE_strsyl3_work,
LAPACKE_stz_nancheck,
LAPACKE_stz_trans,

);
@lapackeobjsz = (
@@ -2878,6 +2991,10 @@ zpotri,
LAPACKE_zgebrd_work,
LAPACKE_zgecon,
LAPACKE_zgecon_work,
LAPACKE_zgedmd,
LAPACKE_zgedmd_work,
LAPACKE_zgedmdq,
LAPACKE_zgedmdq_work,
LAPACKE_zgeequ,
LAPACKE_zgeequ_work,
LAPACKE_zgeequb,
@@ -3345,7 +3462,15 @@ zpotri,
LAPACKE_zgetsqrhrt,
LAPACKE_zgetsqrhrt_work,
LAPACKE_zungtsqr_row,
LAPACKE_zungtsqr_row_work
LAPACKE_zungtsqr_row_work,
LAPACKE_zlangb,
LAPACKE_zlangb_work,
LAPACKE_zunhr_col,
LAPACKE_zunhr_col_work,
LAPACKE_ztrsyl3,
LAPACKE_ztrsyl3_work,
LAPACKE_ztz_nancheck,
LAPACKE_ztz_trans,

## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@@ -3551,7 +3676,7 @@ zpotri,
LAPACKE_zsytrs_aa_2stage_work,
# new functions from 3.9.0
LAPACKE_zgesvdq,
LAPACKE_zgesvdq_work
LAPACKE_zgesvdq_work,
);

#These function may need 2 underscores.
@@ -3573,7 +3698,7 @@ zpotri,
ssygv_2stage,
ssysv_aa_2stage, ssytrf_aa_2stage,
ssytrs_aa_2stage,
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett
);
@lapack_embeded_underscore_objs_c=(
chetf2_rook, chetrf_rook, chetri_rook,
@@ -3598,7 +3723,7 @@ zpotri,
chetrf_aa_2stage, chetrs_aa_2stage,
csysv_aa_2stage, csytrf_aa_2stage,
csytrs_aa_2stage,
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett
);
@lapack_embeded_underscore_objs_d=(
dlasyf_rook,
@@ -3615,7 +3740,7 @@ zpotri,
dsbevd_2stage, dsygv_2stage,
dsysv_aa_2stage,
dsytrf_aa_2stage, dsytrs_aa_2stage,
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett
);
@lapack_embeded_underscore_objs_z=(
zhetf2_rook, zhetrf_rook, zhetri_rook,
@@ -3639,7 +3764,7 @@ zpotri,
zhesv_aa_2stage, zhetrf_aa_2stage,
zhetrs_aa_2stage, zsysv_aa_2stage,
zsytrf_aa_2stage, zsytrs_aa_2stage,
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett
);




+ 8
- 4
interface/CMakeLists.txt View File

@@ -30,17 +30,17 @@ set(BLAS2_SOURCES
gemv.c ger.c
trsv.c trmv.c
syr2.c gbmv.c
sbmv.c
sbmv.c spmv.c
spr2.c
tbsv.c tbmv.c
tpsv.c tpmv.c
)

set(BLAS2_REAL_ONLY_SOURCES
symv.c syr.c spmv.c spr.c
symv.c syr.c spr.c
)
set(BLAS2_COMPLEX_LAPACK_SOURCES
symv.c syr.c spmv.c spr.c
symv.c syr.c spr.c
)

set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES
@@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS)
endforeach ()
endif()

if (NOT DEFINED NO_LAPACK)
if (NOT NO_LAPACK)
set(LAPACK_SOURCES
lapack/gesv.c
)
@@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
endif ()

add_library(interface OBJECT ${OPENBLAS_SRC})

if (USE_OPENMP)
target_link_libraries(interface OpenMP::OpenMP_C)
endif()

+ 18
- 7
interface/gemv.c View File

@@ -70,11 +70,22 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) {
return
MN < 25600L ? 1
: MN < 63001L ? MIN(ncpu, 4)
: MN < 459684L ? MIN(ncpu, 16)
: ncpu;
#ifdef DOUBLE
return (MN < 8100L) ? 1
: (MN < 12100L) ? MIN(ncpu, 2)
: (MN < 36100L) ? MIN(ncpu, 4)
: (MN < 84100L) ? MIN(ncpu, 8)
: (MN < 348100L) ? MIN(ncpu, 16)
: (MN < 435600L) ? MIN(ncpu, 24)
: (MN < 810000L) ? MIN(ncpu, 32)
: (MN < 1050625L) ? MIN(ncpu, 40)
: ncpu;
#else
return (MN < 25600L) ? 1
: (MN < 63001L) ? MIN(ncpu, 4)
: (MN < 459684L) ? MIN(ncpu, 16)
: ncpu;
#endif
}
#endif

@@ -96,11 +107,11 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) {
return num_cpu_avail(4);
return 1;
#endif
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16)
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
return get_gemv_optimal_nthreads_neoversev2(MN, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16)
if (strcmp(gotoblas_corename(), "neoversev1") == 0) {
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu);
}


+ 6
- 1
kernel/CMakeLists.txt View File

@@ -208,7 +208,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
set(USE_TRMM true)
endif ()
set(USE_DIRECT_SGEMM false)
if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME)))
if (X86_64 OR ARM64)
set(USE_DIRECT_SGEMM true)
endif()

@@ -225,9 +225,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S)
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
if (HAVE_SME)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE)
endif ()
endif ()
endif()

foreach (float_type SINGLE DOUBLE)
@@ -1364,6 +1366,9 @@ endif ()
if (USE_GEMM3M)
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
endif()
if (USE_OPENMP)
target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C)
endif()
endfunction ()




+ 4
- 3
kernel/Makefile.L3 View File

@@ -103,8 +103,8 @@ endif
ifeq ($(ARCH), arm64)
ifeq ($(TARGET_CORE), ARMV9SME)
HAVE_SME = 1
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c
endif
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c
endif
endif
endif
@@ -143,9 +143,10 @@ SKERNELOBJS += \
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(ARCH), arm64)
SKERNELOBJS += \
sgemm_direct$(TSUFFIX).$(SUFFIX)
ifdef HAVE_SME
SKERNELOBJS += \
sgemm_direct$(TSUFFIX).$(SUFFIX) \
sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \
sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX)
endif
@@ -835,9 +836,9 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
ifeq ($(ARCH), arm64)
ifdef HAVE_SME
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
ifdef HAVE_SME
$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) :
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@
$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) :


+ 2
- 2
kernel/arm64/KERNEL.A64FX View File

@@ -1,6 +1,6 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE

SGEMVNKERNEL = gemv_n_sve.c
DGEMVNKERNEL = gemv_n_sve.c
SGEMVNKERNEL = gemv_n_sve_v4x3.c
DGEMVNKERNEL = gemv_n_sve_v4x3.c
SGEMVTKERNEL = gemv_t_sve_v4x3.c
DGEMVTKERNEL = gemv_t_sve_v4x3.c

+ 9
- 4
kernel/arm64/KERNEL.ARMV8SVE View File

@@ -74,16 +74,21 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n_sve.c
DGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

SSYMV_L_KERNEL = symv_L_sve_v1x4.c
SSYMV_U_KERNEL = symv_U_sve_v1x4.c
DSYMV_L_KERNEL = symv_L_sve_v1x4.c
DSYMV_U_KERNEL = symv_U_sve_v1x4.c

SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c


+ 5
- 1
kernel/arm64/KERNEL.NEOVERSEN1 View File

@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = sgemv_n_neon.c
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

SSYMV_L_KERNEL = symv_L_asimd_4x4.c
SSYMV_U_KERNEL = symv_U_asimd_4x4.c
DSYMV_L_KERNEL = symv_L_asimd_4x4.c
DSYMV_U_KERNEL = symv_U_asimd_4x4.c

SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c


+ 4
- 4
kernel/arm64/KERNEL.NEOVERSEN2 View File

@@ -60,13 +60,13 @@ DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S



+ 2
- 0
kernel/arm64/KERNEL.NEOVERSEV1 View File

@@ -1,5 +1,7 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE

SGEMVNKERNEL = gemv_n_sve_v1x3.c
DGEMVNKERNEL = gemv_n_sve_v1x3.c
SGEMVTKERNEL = gemv_t_sve_v1x3.c
DGEMVTKERNEL = gemv_t_sve_v1x3.c
ifeq ($(BUILD_BFLOAT16), 1)


+ 50
- 3
kernel/arm64/dot.c View File

@@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);

#ifdef DYNAMIC_ARCH
extern char* gotoblas_corename(void);
#endif

#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) {
#ifdef DOUBLE
return (N <= 10000L) ? 1
: (N <= 64500L) ? 1
: (N <= 100000L) ? MIN(ncpu, 2)
: (N <= 150000L) ? MIN(ncpu, 4)
: (N <= 260000L) ? MIN(ncpu, 8)
: (N <= 360000L) ? MIN(ncpu, 16)
: (N <= 520000L) ? MIN(ncpu, 24)
: (N <= 1010000L) ? MIN(ncpu, 56)
: ncpu;
#else
return (N <= 10000L) ? 1
: (N <= 110000L) ? 1
: (N <= 200000L) ? MIN(ncpu, 2)
: (N <= 280000L) ? MIN(ncpu, 4)
: (N <= 520000L) ? MIN(ncpu, 8)
: (N <= 830000L) ? MIN(ncpu, 16)
: (N <= 1010000L) ? MIN(ncpu, 24)
: ncpu;
#endif
}
#endif

static inline int get_dot_optimal_nthreads(BLASLONG n) {
int ncpu = num_cpu_avail(1);

#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16)
return get_dot_optimal_nthreads_neoversev1(n, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16)
if (strcmp(gotoblas_corename(), "neoversev1") == 0) {
return get_dot_optimal_nthreads_neoversev1(n, ncpu);
}
#endif

// Default case
if (n <= 10000L)
return 1;
else
return num_cpu_avail(1);
}
#endif

static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
RETURN_TYPE dot = 0.0;

#if defined(SMP)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
if (inc_x == 0 || inc_y == 0)
nthreads = 1;
else
nthreads = num_cpu_avail(1);
nthreads = get_dot_optimal_nthreads(n);

if (nthreads == 1) {
dot = dot_compute(n, x, inc_x, y, inc_y);
@@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y

blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, result, 0,
( void *)dot_thread_function, nthreads);
(void *)dot_thread_function, nthreads);

ptr = (RETURN_TYPE *)result;
for (i = 0; i < nthreads; i++) {


+ 138
- 0
kernel/arm64/gemv_n_sve_v1x3.c View File

@@ -0,0 +1,138 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_sve.h>

#include "common.h"

#ifdef DOUBLE
#define SV_COUNT svcntd
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP svdup_f64
#else
#define SV_COUNT svcntw
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT temp;

ix = 0;
a_ptr = a;

if (inc_y == 1) {
BLASLONG width = (n + 3 - 1) / 3;

FLOAT *a0_ptr = a_ptr + lda * width * 0;
FLOAT *a1_ptr = a_ptr + lda * width * 1;
FLOAT *a2_ptr = a_ptr + lda * width * 2;

FLOAT *x0_ptr = x + inc_x * width * 0;
FLOAT *x1_ptr = x + inc_x * width * 1;
FLOAT *x2_ptr = x + inc_x * width * 2;

for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0);
i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);

svst1_vnum(SV_TRUE(), y + i, 0, y0_vec);
i += sve_size * 1;
}

if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg02 = svand_z(SV_TRUE(), pg0, pg02);

SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);

svst1_vnum(pg0, y + i, 0, y0_vec);
}
a0_ptr += lda;
a1_ptr += lda;
a2_ptr += lda;
ix += inc_x;
}
return(0);
}

for (j = 0; j < n; j++) {
temp = alpha * x[ix];
iy = 0;
for (i = 0; i < m; i++) {
y[iy] += temp * a_ptr[i];
iy += inc_y;
}
a_ptr += lda;
ix += inc_x;
}
return (0);
}

+ 207
- 0
kernel/arm64/gemv_n_sve_v4x3.c View File

@@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_sve.h>

#include "common.h"

#ifdef DOUBLE
#define SV_COUNT svcntd
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP svdup_f64
#else
#define SV_COUNT svcntw
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT temp;

ix = 0;
a_ptr = a;

if (inc_y == 1) {
BLASLONG width = (n + 3 - 1) / 3;

FLOAT *a0_ptr = a_ptr + lda * width * 0;
FLOAT *a1_ptr = a_ptr + lda * width * 1;
FLOAT *a2_ptr = a_ptr + lda * width * 2;

FLOAT *x0_ptr = x + inc_x * width * 0;
FLOAT *x1_ptr = x + inc_x * width * 1;
FLOAT *x2_ptr = x + inc_x * width * 2;

for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0);
SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0);
i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 4 - 1) < m) {
SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0);
SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1);
SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2);
SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec);
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec);
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec);
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec);
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec);
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec);
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec);

svst1_vnum(SV_TRUE(), y + i, 0, y0_vec);
svst1_vnum(SV_TRUE(), y + i, 1, y1_vec);
svst1_vnum(SV_TRUE(), y + i, 2, y2_vec);
svst1_vnum(SV_TRUE(), y + i, 3, y3_vec);
i += sve_size * 4;
}

if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);
svbool_t pg1 = SV_WHILE(i + sve_size * 1, m);
svbool_t pg2 = SV_WHILE(i + sve_size * 2, m);
svbool_t pg3 = SV_WHILE(i + sve_size * 3, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg10 = svand_z(SV_TRUE(), pg1, pg10);
pg20 = svand_z(SV_TRUE(), pg2, pg20);
pg30 = svand_z(SV_TRUE(), pg3, pg30);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg11 = svand_z(SV_TRUE(), pg1, pg11);
pg21 = svand_z(SV_TRUE(), pg2, pg21);
pg31 = svand_z(SV_TRUE(), pg3, pg31);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
pg12 = svand_z(SV_TRUE(), pg1, pg12);
pg22 = svand_z(SV_TRUE(), pg2, pg22);
pg32 = svand_z(SV_TRUE(), pg3, pg32);

SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0);
SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1);
SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2);
SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec);
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec);
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec);
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec);
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec);
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec);
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec);
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec);
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec);
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec);
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec);
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec);

svst1_vnum(pg0, y + i, 0, y0_vec);
svst1_vnum(pg1, y + i, 1, y1_vec);
svst1_vnum(pg2, y + i, 2, y2_vec);
svst1_vnum(pg3, y + i, 3, y3_vec);
}
a0_ptr += lda;
a1_ptr += lda;
a2_ptr += lda;
ix += inc_x;
}
return(0);
}

for (j = 0; j < n; j++) {
temp = alpha * x[ix];
iy = 0;
for (i = 0; i < m; i++) {
y[iy] += temp * a_ptr[i];
iy += inc_y;
}
a_ptr += lda;
ix += inc_x;
}
return (0);
}

+ 24
- 3
kernel/arm64/sgemm_direct_arm64_sme1.c View File

@@ -7,7 +7,6 @@
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#if defined(HAVE_SME)
/* Function prototypes */
@@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
m_mod = ceil((double)M/(double)vl_elms) * vl_elms;
float *A_mod = (float *) malloc(m_mod*K*sizeof(float));
/* Prevent compiler optimization by reading from memory instead
* of reading directly from vector (z) registers.
* */
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
/* Pre-process the left matrix to make it suitable for
matrix sum of outer-product calculation
*/
@@ -52,8 +61,20 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
/* Calculate C = A*B */
sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
free(A_mod);
}
#else
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
float * __restrict R, BLASLONG strideR){}
#endif

+ 219
- 0
kernel/arm64/sgemv_n_neon.c View File

@@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_neon.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT temp;

ix = 0;
a_ptr = a;

if (inc_x == 1 && inc_y == 1) {
FLOAT *a0_ptr = a + lda * 0;
FLOAT *a1_ptr = a + lda * 1;
FLOAT *a2_ptr = a + lda * 2;
FLOAT *a3_ptr = a + lda * 3;
FLOAT *a4_ptr = a + lda * 4;
FLOAT *a5_ptr = a + lda * 5;
FLOAT *a6_ptr = a + lda * 6;
FLOAT *a7_ptr = a + lda * 7;

j = 0;
while (j + 3 < n) {
float32x4_t x0_vec = vld1q_f32(x + j);
x0_vec = vmulq_n_f32(x0_vec, alpha);
i = 0;
while (i + 7 < m) {
float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
float32x4_t a20_vec = vld1q_f32(a2_ptr + i);
float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4);
float32x4_t a30_vec = vld1q_f32(a3_ptr + i);
float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4);

float32x4_t y0_vec = vld1q_f32(y + i);
float32x4_t y1_vec = vld1q_f32(y + i + 4);
y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0);
y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1);
y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2);
y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3);
y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0);
y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1);
y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2);
y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3);

vst1q_f32(y + i, y0_vec);
vst1q_f32(y + i + 4, y1_vec);

i += 8;
}
while (i + 3 < m) {
float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
float32x4_t a2_vec = vld1q_f32(a2_ptr + i);
float32x4_t a3_vec = vld1q_f32(a3_ptr + i);

float32x4_t y_vec = vld1q_f32(y + i);
y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0);
y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1);
y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2);
y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3);

vst1q_f32(y + i, y_vec);

i += 4;
}
while (i + 1 < m) {
float32x2_t a0_vec = vld1_f32(a0_ptr + i);
float32x2_t a1_vec = vld1_f32(a1_ptr + i);
float32x2_t a2_vec = vld1_f32(a2_ptr + i);
float32x2_t a3_vec = vld1_f32(a3_ptr + i);

float32x2_t y_vec = vld1_f32(y + i);
y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0);
y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1);
y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2);
y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3);

vst1_f32(y + i, y_vec);

i += 2;
}
while (i < m) {
y[i] += a0_ptr[i] * x0_vec[0];
y[i] += a1_ptr[i] * x0_vec[1];
y[i] += a2_ptr[i] * x0_vec[2];
y[i] += a3_ptr[i] * x0_vec[3];

i++;
}

a0_ptr += lda * 4;
a1_ptr += lda * 4;
a2_ptr += lda * 4;
a3_ptr += lda * 4;

j += 4;
}
while (j + 1 < n) {
float32x2_t x0_vec = vld1_f32(x + j);
x0_vec = vmul_n_f32(x0_vec, alpha);
i = 0;
while (i + 7 < m) {
float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);

float32x4_t y0_vec = vld1q_f32(y + i);
float32x4_t y1_vec = vld1q_f32(y + i + 4);
y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0);
y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1);
y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0);
y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1);

vst1q_f32(y + i, y0_vec);
vst1q_f32(y + i + 4, y1_vec);

i += 8;
}
while (i + 3 < m) {
float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
float32x4_t a1_vec = vld1q_f32(a1_ptr + i);

float32x4_t y_vec = vld1q_f32(y + i);
y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0);
y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1);

vst1q_f32(y + i, y_vec);

i += 4;
}
while (i + 1 < m) {
float32x2_t a0_vec = vld1_f32(a0_ptr + i);
float32x2_t a1_vec = vld1_f32(a1_ptr + i);

float32x2_t y_vec = vld1_f32(y + i);
y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0);
y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1);

vst1_f32(y + i, y_vec);

i += 2;
}
while (i < m) {
y[i] += a0_ptr[i] * x0_vec[0];
y[i] += a1_ptr[i] * x0_vec[1];

i++;
}

a0_ptr += lda * 2;
a1_ptr += lda * 2;

j += 2;
}
while (j < n) {
i = 0;
temp = alpha * x[j];
while (i < m) {
y[i] += a0_ptr[i] * temp;
i++;
}

a0_ptr += lda;
j++;
}
return (0);
}

for (j = 0; j < n; j++) {
temp = alpha * x[ix];
iy = 0;
for (i = 0; i < m; i++) {
y[iy] += temp * a_ptr[i];
iy += inc_y;
}
a_ptr += lda;
ix += inc_x;
}
return (0);
}

+ 113
- 0
kernel/arm64/symv_L_asimd_4x4.c View File

@@ -0,0 +1,113 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "symv_microk_asimd_4x4.c"

int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j;
FLOAT temp1, temp2;
FLOAT tmp1[4];
FLOAT tmp2[4];
FLOAT *a0, *a1, *a2, *a3;
FLOAT x0, x1, x2, x3;
FLOAT *X = x;
FLOAT *Y = y;

if (inc_y != 1) {
Y = buffer;
COPY_K(m, y, inc_y, Y, 1);
}
if (inc_x != 1) {
if (inc_y != 1) {
X = Y + m;
} else {
X = buffer;
}
COPY_K(m, x, inc_x, X, 1);
}

BLASLONG offset1 = (offset / 4) * 4;
for (j = 0; j < offset1; j+=4) {
a0 = &a[j*lda];
a1 = a0 + lda;
a2 = a1 + lda;
a3 = a2 + lda;
x0 = X[j];
x1 = X[j+1];
x2 = X[j+2];
x3 = X[j+3];
tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3;
tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3;
tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3;
tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3;
tmp1[0] = alpha * x0;
tmp1[1] = alpha * x1;
tmp1[2] = alpha * x2;
tmp1[3] = alpha * x3;

BLASLONG m2 = (m/4)*4;
if (m2 > j+4)
symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2);

for (i = m2; i < m; i++) {
Y[i] += tmp1[0] * a0[i];
tmp2[0] += a0[i] * X[i];
Y[i] += tmp1[1] * a1[i];
tmp2[1] += a1[i] * X[i];
Y[i] += tmp1[2] * a2[i];
tmp2[2] += a2[i] * X[i];
Y[i] += tmp1[3] * a3[i];
tmp2[3] += a3[i] * X[i];
}
Y[j] += alpha * tmp2[0];
Y[j+1] += alpha * tmp2[1];
Y[j+2] += alpha * tmp2[2];
Y[j+3] += alpha * tmp2[3];
}

for (j = offset1; j < offset; j++) {
temp1 = alpha * X[j];
temp2 = 0.0;
Y[j] += temp1 * a[j*lda+j];
for (i = j+1; i < m; i++) {
Y[i] += temp1 * a[j*lda+i];
temp2 += a[j*lda+i] * X[i];
}
Y[j] += alpha * temp2;
}

if (inc_y != 1) {
COPY_K(m, Y, 1, y, inc_y);
}
return(0);
}

+ 103
- 0
kernel/arm64/symv_L_sve_v1x4.c View File

@@ -0,0 +1,103 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "symv_microk_sve_v1x4.c"

int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j;
FLOAT temp1, temp2;
FLOAT tmp1[4];
FLOAT tmp2[4];
FLOAT *a0, *a1, *a2, *a3;
FLOAT x0, x1, x2, x3;
FLOAT *X = x;
FLOAT *Y = y;

if (inc_y != 1) {
Y = buffer;
COPY_K(m, y, inc_y, Y, 1);
}
if (inc_x != 1) {
if (inc_y != 1) {
X = Y + m;
} else {
X = buffer;
}
COPY_K(m, x, inc_x, X, 1);
}

BLASLONG offset1 = (offset / 4) * 4;

for (j = 0; j < offset1; j+=4) {
a0 = &a[j*lda];
a1 = a0 + lda;
a2 = a1 + lda;
a3 = a2 + lda;
x0 = X[j];
x1 = X[j+1];
x2 = X[j+2];
x3 = X[j+3];
tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3;
tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3;
tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3;
tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3;
tmp1[0] = alpha * x0;
tmp1[1] = alpha * x1;
tmp1[2] = alpha * x2;
tmp1[3] = alpha * x3;

symv_kernel_v1x4(j+4, m, a0, a1, a2, a3, X, Y, tmp1, tmp2);

Y[j] += alpha * tmp2[0];
Y[j+1] += alpha * tmp2[1];
Y[j+2] += alpha * tmp2[2];
Y[j+3] += alpha * tmp2[3];
}

for (j = offset1; j < offset; j++) {
temp1 = alpha * X[j];
temp2 = 0.0;
a0 = &a[j*lda];
Y[j] += temp1 * a0[j];
for (i = j+1; i < m; i++) {
Y[i] += temp1 * a0[i];
temp2 += a0[i] * X[i];
}
Y[j] += alpha * temp2;
}

if (inc_y != 1) {
COPY_K(m, Y, 1, y, inc_y);
}
return(0);
}

+ 106
- 0
kernel/arm64/symv_U_asimd_4x4.c View File

@@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "symv_microk_asimd_4x4.c"

int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j, j1, j2, m2;
FLOAT temp1, temp2;
FLOAT tmp1[4];
FLOAT tmp2[4];
FLOAT *a0, *a1, *a2, *a3;
FLOAT *X = x;
FLOAT *Y = y;

BLASLONG m1 = m - offset;
if (inc_y != 1) {
Y = buffer;
COPY_K(m, y, inc_y, Y, 1);
}
if (inc_x != 1) {
if (inc_y != 1) {
X = Y + m;
} else {
X = buffer;
}
COPY_K(m, x, inc_x, X, 1);
}

m2 = m - (offset % 4);
for (j = m1; j < m2; j += 4) {
tmp1[0] = alpha * X[j];
tmp1[1] = alpha * X[j+1];
tmp1[2] = alpha * X[j+2];
tmp1[3] = alpha * X[j+3];
tmp2[0] = 0.0;
tmp2[1] = 0.0;
tmp2[2] = 0.0;
tmp2[3] = 0.0;
a0 = &a[j*lda];
a1 = a0 + lda;
a2 = a1 + lda;
a3 = a2 + lda;
j1 = (j / 4) * 4;
if ( j1 )
symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2);

j2 = 0;
for (j1 = j ; j1 < j+4 ; j1++) {
temp1 = tmp1[j2];
temp2 = tmp2[j2];
a0 = &a[j1*lda];
for (i=j ; i<j1; i++) {
Y[i] += temp1 * a0[i];
temp2 += a0[i] * X[i];
}
Y[j1] += temp1 * a0[j1] + alpha * temp2;
j2++;
}
}

for ( ; j < m; j++) {
temp1 = alpha * X[j];
temp2 = 0.0;
a0 = &a[j*lda];
for (i = 0 ; i < j; i++) {
Y[i] += temp1 * a0[i];
temp2 += a0[i] * X[i];
}
Y[j] += temp1 * a0[j] + alpha * temp2;
}

if (inc_y != 1) {
COPY_K(m, Y, 1, y, inc_y);
}
return(0);
}

+ 104
- 0
kernel/arm64/symv_U_sve_v1x4.c View File

@@ -0,0 +1,104 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "symv_microk_sve_v1x4.c"

int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j, j1, j2, m2;
FLOAT temp1, temp2;
FLOAT tmp1[4];
FLOAT tmp2[4];
FLOAT *a0, *a1, *a2, *a3;
FLOAT *X = x;
FLOAT *Y = y;

BLASLONG m1 = m - offset;
if (inc_y != 1) {
Y = buffer;
COPY_K(m, y, inc_y, Y, 1);
}
if (inc_x != 1) {
if (inc_y != 1) {
X = Y + m;
} else {
X = buffer;
}
COPY_K(m, x, inc_x, X, 1);
}

m2 = m - (offset % 4);
for (j = m1; j < m2; j += 4) {
tmp1[0] = alpha * X[j];
tmp1[1] = alpha * X[j+1];
tmp1[2] = alpha * X[j+2];
tmp1[3] = alpha * X[j+3];
tmp2[0] = 0.0;
tmp2[1] = 0.0;
tmp2[2] = 0.0;
tmp2[3] = 0.0;
a0 = &a[j*lda];
a1 = a0 + lda;
a2 = a1 + lda;
a3 = a2 + lda;
symv_kernel_v1x4(0, j, a0, a1, a2, a3, X, Y, tmp1, tmp2);

j2 = 0;
for (j1 = j ; j1 < j+4 ; j1++) {
temp1 = tmp1[j2];
temp2 = tmp2[j2];
a0 = &a[j1*lda];
for (i=j ; i<j1; i++) {
Y[i] += temp1 * a0[i];
temp2 += a0[i] * X[i];
}
Y[j1] += temp1 * a0[j1] + alpha * temp2;
j2++;
}
}

for ( ; j < m; j++) {
temp1 = alpha * X[j];
temp2 = 0.0;
a0 = &a[j*lda];
for (i = 0 ; i < j; i++) {
Y[i] += temp1 * a0[i];
temp2 += a0[i] * X[i];
}
Y[j] += temp1 * a0[j] + alpha * temp2;
}

if (inc_y != 1) {
COPY_K(m, Y, 1, y, inc_y);
}
return(0);
}

+ 120
- 0
kernel/arm64/symv_microk_asimd_4x4.c View File

@@ -0,0 +1,120 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <arm_neon.h>

static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3,
FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
#ifdef DOUBLE
float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]);
float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]);
float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]);
float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]);
float64x2_t vtmpy0 = {0.0, 0.0};
float64x2_t vtmpy1 = {0.0, 0.0};
float64x2_t vtmpy2 = {0.0, 0.0};
float64x2_t vtmpy3 = {0.0, 0.0};
float64x2_t vxl, vxh, vyl, vyh;
float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h;
BLASLONG i;
for (i = from; i < to; i+=4) {
vyl = vld1q_f64(&y[i]);
vyh = vld1q_f64(&y[i+2]);
vxl = vld1q_f64(&x[i]);
vxh = vld1q_f64(&x[i+2]);
vap0l = vld1q_f64(&a0[i]);
vap0h = vld1q_f64(&a0[i+2]);
vap1l = vld1q_f64(&a1[i]);
vap1h = vld1q_f64(&a1[i+2]);
vap2l = vld1q_f64(&a2[i]);
vap2h = vld1q_f64(&a2[i+2]);
vap3l = vld1q_f64(&a3[i]);
vap3h = vld1q_f64(&a3[i+2]);
vyl = vfmaq_f64(vyl, vtmpx0, vap0l);
vyh = vfmaq_f64(vyh, vtmpx0, vap0h);
vyl = vfmaq_f64(vyl, vtmpx1, vap1l);
vyh = vfmaq_f64(vyh, vtmpx1, vap1h);
vyl = vfmaq_f64(vyl, vtmpx2, vap2l);
vyh = vfmaq_f64(vyh, vtmpx2, vap2h);
vyl = vfmaq_f64(vyl, vtmpx3, vap3l);
vyh = vfmaq_f64(vyh, vtmpx3, vap3h);
vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l);
vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h);
vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l);
vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l);
vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h);
vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h);
vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l);
vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h);
vst1q_f64(&y[i], vyl);
vst1q_f64(&y[i+2], vyh);
}
temp2[0] += vaddvq_f64(vtmpy0);
temp2[1] += vaddvq_f64(vtmpy1);
temp2[2] += vaddvq_f64(vtmpy2);
temp2[3] += vaddvq_f64(vtmpy3);
#else
float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]);
float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]);
float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]);
float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]);
float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0};
float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0};
float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0};
float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0};
float32x4_t vx, vy;
float32x4_t vap0, vap1, vap2, vap3;
BLASLONG i;
for (i = from; i < to; i+=4) {
vy = vld1q_f32(&y[i]);
vx = vld1q_f32(&x[i]);
vap0 = vld1q_f32(&a0[i]);
vap1 = vld1q_f32(&a1[i]);
vap2 = vld1q_f32(&a2[i]);
vap3 = vld1q_f32(&a3[i]);
vy = vfmaq_f32(vy, vtmpx0, vap0);
vy = vfmaq_f32(vy, vtmpx1, vap1);
vy = vfmaq_f32(vy, vtmpx2, vap2);
vy = vfmaq_f32(vy, vtmpx3, vap3);
vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0);
vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1);
vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2);
vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3);
vst1q_f32(&y[i], vy);
}
temp2[0] += vaddvq_f32(vtmpy0);
temp2[1] += vaddvq_f32(vtmpy1);
temp2[2] += vaddvq_f32(vtmpy2);
temp2[3] += vaddvq_f32(vtmpy3);
#endif
}

+ 89
- 0
kernel/arm64/symv_microk_sve_v1x4.c View File

@@ -0,0 +1,89 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <arm_sve.h>

#ifdef DOUBLE
#define SV_COUNT svcntd
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64_s64
#define SV_DUP svdup_f64
#else
#define SV_COUNT svcntw
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif

static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3,
FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
{
SV_TYPE vtmpx0 = SV_DUP(temp1[0]);
SV_TYPE vtmpx1 = SV_DUP(temp1[1]);
SV_TYPE vtmpx2 = SV_DUP(temp1[2]);
SV_TYPE vtmpx3 = SV_DUP(temp1[3]);
SV_TYPE vtmpy0 = SV_DUP(0.0);
SV_TYPE vtmpy1 = SV_DUP(0.0);
SV_TYPE vtmpy2 = SV_DUP(0.0);
SV_TYPE vtmpy3 = SV_DUP(0.0);
SV_TYPE vx, vy;
SV_TYPE vap0, vap1, vap2, vap3;
BLASLONG i;
uint64_t sve_size = SV_COUNT();
svbool_t pg;

for (i = from; i < to; i += sve_size) {
pg = SV_WHILE(i, to);
vy = svld1(pg, &y[i]);
vx = svld1(pg, &x[i]);
vap0 = svld1(pg, &a0[i]);
vap1 = svld1(pg, &a1[i]);
vap2 = svld1(pg, &a2[i]);
vap3 = svld1(pg, &a3[i]);
vy = svmla_m(pg, vy, vtmpx0, vap0);
vy = svmla_m(pg, vy, vtmpx1, vap1);
vy = svmla_m(pg, vy, vtmpx2, vap2);
vy = svmla_m(pg, vy, vtmpx3, vap3);
vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0);
vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1);
vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2);
vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3);
svst1(pg, &y[i], vy);
}
pg = SV_TRUE();
temp2[0] += svaddv(pg, vtmpy0);
temp2[1] += svaddv(pg, vtmpy1);
temp2[2] += svaddv(pg, vtmpy2);
temp2[3] += svaddv(pg, vtmpy3);
}

+ 1
- 6
kernel/loongarch64/amax_lasx.S View File

@@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif

xvxor.v VM0, VM0, VM0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
xvldrepl.d VM0, X, 0
#else
xvldrepl.w VM0, X, 0
#endif
XVFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 4


+ 10
- 12
kernel/loongarch64/asum_lasx.S View File

@@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvpickve.w VX0, res1, 4
xvpickve.w VX1, res1, 5
xvpickve.w VX2, res1, 6
xvpickve.w VX3, res1, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3

@@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
#else
xvfadd.s res2, res1, res2
xvpickve.w VX1, res1, 1
xvpickve.w VX2, res1, 2
xvpickve.w VX3, res1, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvpickve.w VX0, res2, 4
xvpickve.w VX1, res2, 5
xvpickve.w VX2, res2, 6
xvpickve.w VX3, res2, 7
xvpickve.w VX0, res1, 4
xvpickve.w VX1, res1, 5
xvpickve.w VX2, res1, 6
xvpickve.w VX3, res1, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
#endif
.align 3



+ 1
- 1
kernel/loongarch64/cdot_lasx.S View File

@@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
addi.d Y, Y, 8 * SIZE
addi.d Y, Y, 16 * SIZE
xvpickev.w x3, VX3, VX2
xvpickod.w x4, VX3, VX2
xvfmadd.s res1, x1, x3, res1


+ 53
- 25
kernel/loongarch64/cnrm2_lasx.S View File

@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VX4 $xr21
#define res1 $xr19
#define res2 $xr20
#define RCP $f2
#define VALPHA $xr3

PROLOGUE

@@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif

xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl camax_k_LA264
#else
bl camax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w $vr3, $vr2, 0
xvpermi.d VALPHA, $xr3,0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999

li.d TEMP, SIZE
slli.d INCX, INCX, ZBASE_SHIFT
srai.d I, N, 2
@@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3

.L10:
xvld VX0, X, 0 * SIZE
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
addi.d I, I, -1
addi.d X, X, 8 * SIZE

xvld VX0, X, 0 * SIZE
xvld VX1, X, 8 * SIZE
xvfmul.s VX0, VX0, VALPHA
xvfmul.s VX1, VX1, VALPHA
xvfmadd.s res1, VX0, VX0, res1
xvfmadd.s res2, VX1, VX1, res2

addi.d X, X, 16 * SIZE
blt $r0, I, .L10
.align 3
b .L996
@@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
add.d X, X, INCX
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
xvfadd.s res1, res1, res2
xvpermi.d VX1, res1, 0x4e
xvfadd.s res1, res1, VX1
vreplvei.w $vr17, $vr19, 1
vreplvei.w $vr18, $vr19, 2
vreplvei.w $vr21, $vr19, 3
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvfadd.s res1, VX4, res1
.align 3

.L997:
@@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s a1, a1
fcvt.d.s a2, a2
fmadd.d res, a1, a1, res
fmadd.d res, a2, a2, res
fmul.s a1, a1, RCP
fmul.s a2, a2, RCP
fmadd.s res, a1, a1, res
fmadd.s res, a2, a2, res
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d res, res
fsqrt.s res, res
fmul.s $f0, res, $f0
move $r4, $r17
fcvt.s.d $f0, res
jirl $r0, $r1, 0x0

EPILOGUE

+ 4
- 4
kernel/loongarch64/copy_lasx.S View File

@@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
@@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY
ST a2, Y, 0
add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222


+ 61
- 186
kernel/loongarch64/cscal_lasx.S View File

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
#define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
@@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
bne INCX, TEMP, .L22

/////// INCX == 1 ////////
.L11:
bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L13
b .L14
.align 3
bge $r0, I, .L19
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal.

.L13:
bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
b .L113 //alpha_r != 0.0 && alpha_i == 0.0
bceqz $fcc0, .L17

.L14:
bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
b .L111 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L17

.L111: //alpha_r == 0.0 && alpha_i == 0.0
.L15: //alpha_r == 0.0 && alpha_i == 0.0
xvst VXZ, X, 0 * SIZE
#ifdef DOUBLE
xvst VXZ, X, 4 * SIZE
@@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmul.d x3, VXAR, x1
xvfmul.d x4, VXAR, x2
xvilvl.d VX2, x4 ,x3
xvilvh.d VX3, x4, x3
xvst VX2, X, 0 * SIZE
xvst VX3, X, 4 * SIZE
addi.d X, X, 8 * SIZE
#else
xvld VX1, X, 8 * SIZE
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VXAR, x1
xvfmul.s x4, VXAR, x2
xvilvl.w VX2, x4 ,x3
xvilvh.w VX3, x4, x3
xvst VX2, X, 0 * SIZE
xvst VX3, X, 8 * SIZE
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L113
b .L997
blt $r0, I, .L15
b .L19
.align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
.L17:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
@@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L114
b .L997
blt $r0, I, .L17
b .L19
.align 3

/////// INCX == 1 && N < 8 ///////
.L19:
#ifdef DOUBLE
andi I, N, 3
#else
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0
.align 3

/////// INCX != 1 ////////
.L22:
bge $r0, I, .L997
move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L23
b .L24
.align 3

.L23:
bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
b .L223 //alpha_r != 0.0 && alpha_i == 0.0
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal.
bceqz $fcc0, .L25

.L24:
bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
b .L221 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L25

.L221: //alpha_r == 0.0 && alpha_i == 0.0
.L27: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
xvstelm.d VXZ, X, 0, 0
xvstelm.d VXZ, X, 1 * SIZE, 0
@@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
add.d X, X, INCX

xvfmul.d x3, VXAR, x1
xvfmul.d x4, VXAR, x2
addi.d I, I, -1
xvstelm.d x3, XX, 0 * SIZE, 0
xvstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 1
xvstelm.d x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 2
xvstelm.d x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
xvstelm.d x3, XX, 0 * SIZE, 3
xvstelm.d x4, XX, 1 * SIZE, 3
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 0
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
xvinsgr2vr.w x1, t1, 2
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
add.d X, X, INCX
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
add.d X, X, INCX

xvfmul.s x3, VXAR, x1
xvfmul.s x4, VXAR, x2
addi.d I, I, -1
xvstelm.w x3, XX, 0 * SIZE, 0
xvstelm.w x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 1
xvstelm.w x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 2
xvstelm.w x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 3
xvstelm.w x4, XX, 1 * SIZE, 3
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 4
xvstelm.w x4, XX, 1 * SIZE, 4
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 5
xvstelm.w x4, XX, 1 * SIZE, 5
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 6
xvstelm.w x4, XX, 1 * SIZE, 6
add.d XX, XX, INCX
xvstelm.w x3, XX, 0 * SIZE, 7
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
blt $r0, I, .L223
b .L997
blt $r0, I, .L27
b .L29
.align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
.L25:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
add.d X, X, INCX

xvfmul.d VX0, VXAI, x2
xvfmsub.d x3, VXAR, x1, VX0
xvfmul.d VX1, VXAI, x1
@@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
add.d X, X, INCX

xvfmul.s VX0, VXAI, x2
xvfmsub.s x3, VXAR, x1, VX0
xvfmul.s VX1, VXAI, x1
@@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
blt $r0, I, .L224
b .L997
blt $r0, I, .L25
b .L29
.align 3

.L997:
/////// INCX != 1 && N < 8 ///////
.L29:
#ifdef DOUBLE
andi I, N, 3
andi I, N, 3
#else
andi I, N, 7
andi I, N, 7
#endif
bge $r0, I, .L999
.align 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

.L995: // alpha_r == 0.0 && alpha_i == 0.0
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L995
b .L999
.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
@@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
.align 3
b .L999

.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3

EPILOGUE

+ 29
- 57
kernel/loongarch64/dot_lasx.S View File

@@ -53,8 +53,8 @@ PROLOGUE
#endif

/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
@@ -64,20 +64,6 @@ PROLOGUE

/* !((inc_x == 1) && (inc_y == 1)) */

/* init $xr8 and $xr9 to zero */
#ifdef DOUBLE
xvldrepl.d $xr0, X, 0
#else
xvldrepl.w $xr0, X, 0
#endif
#ifdef DSDOT
xvfcvtl.d.s $xr0, $xr0
xvfsub.d $xr8, $xr0, $xr0
xvfsub.d $xr9, $xr0, $xr0
#else
XVFSUB $xr8, $xr0, $xr0
XVFSUB $xr9, $xr0, $xr0
#endif

#ifdef DOUBLE
srai.d I, N, 4
@@ -99,31 +85,31 @@ PROLOGUE
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
#ifdef DSDOT
#ifndef DOUBLE
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr1
xvfcvtl.d.s $xr11, $xr5
xvfcvth.d.s $xr12, $xr1
xvfcvth.d.s $xr13, $xr5
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr2
xvfcvtl.d.s $xr11, $xr6
xvfcvth.d.s $xr12, $xr2
xvfcvth.d.s $xr13, $xr6
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr3
xvfcvtl.d.s $xr11, $xr7
xvfcvth.d.s $xr12, $xr3
xvfcvth.d.s $xr13, $xr7
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
@@ -149,13 +135,13 @@ PROLOGUE
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
#ifdef DSDOT
#ifndef DOUBLE
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfmadd.d $xr8, $xr10, $xr11, $xr8
xvfmadd.d $xr9, $xr12, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
#endif
@@ -163,27 +149,12 @@ PROLOGUE
.align 3
.L14:
/* store dot in s1 $f8 */
#ifdef DSDOT
xvfadd.d $xr8, $xr8, $xr9
fsub.s s2, s2, s2 /* set s2 to 0.0 */
fsub.d s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
vfadd.d $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
XVFADD $xr8, $xr8, $xr9
SUB s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
VFADD $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
#ifdef DOUBLE
@@ -197,7 +168,7 @@ PROLOGUE
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -240,7 +211,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -252,7 +223,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -264,7 +235,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -276,7 +247,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -288,7 +259,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -300,7 +271,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -312,7 +283,7 @@ PROLOGUE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -325,7 +296,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
@@ -346,7 +317,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
@@ -357,12 +328,13 @@ PROLOGUE
.align 3

.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
move $r4, $r17
#if defined(DOUBLE)
#elif defined(DSDOT)
#else
ADD $f0, s1, s2
fcvt.s.d $f0, $f0
#endif
move $r4, $r17
jirl $r0, $r1, 0x0

EPILOGUE

+ 282
- 284
kernel/loongarch64/iamax_lasx.S View File

@@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
#define VZE $xr3
#define VT1 $xr4
#define VT2 $xr5
#define VC0 $xr6

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
xvldi VZE, 0
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
#ifdef DOUBLE
xvfsub.d VT1, VZE, VM0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvfmaxa.d VM0, VM0, VT1
bge $r0, I, .L11
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
@@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
xvfsub.s VT1, VZE, VM0
addi.w i0, i0, 1
srai.d I, N, 3
xvfmaxa.s VM0, VM0, VT1
bge $r0, I, .L21
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DOUBLE
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvld VX1, X, 2 * SIZE
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1)
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i

xvld VX0, X, 4 * SIZE
xvadd.d VI1, VI2, VINC4
xvld VX1, X, 6 * SIZE
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x4, VI1, VI2, VT0 //i
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0 //abs(maxf)
xvbitsel.v x2, x2, x4, VC0 //i
xvfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
#else
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VM1
xvld VX1, X, 4 * SIZE
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i
addi.d I, I, -1
xvfcmp.clt.s VT0, VM0, x1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

#endif
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f9, $f10
bceqz $fcc0, .L16
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L17
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
b .L26
#endif
XVFMAXA VM1, x1, x2
XVCMPEQ VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
XVFMAXA VM0, x3, x4
XVCMPEQ VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
XVFMAXA VM0, VM0, VM1
XVCMPEQ VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
.align 3

#ifdef DOUBLE
.L16:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L17:
movfr2gr.d i0, $f20
.align 3

.L11: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L13:
fld.d $f9, X, 0
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
addi.d I, I, -1
addi.d i1, i1, 1
addi.d X, X, SIZE
movgr2fr.d $f21, i1
blt $r0, I, .L13
movfr2gr.d i0, $f20
.align 3

.L14:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvfsub.d VT1, VZE, VM0
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvfmaxa.d VM0, VM0, VT1
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
addi.d i0, i0, 3
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3

.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4

xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI2, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0
xvbitsel.v x4, VI1, VI2, VT0
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0
xvbitsel.v x2, x2, x4, VC0
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

addi.d I, I, -1
blt $r0, I, .L24
.align 3

.L25:
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
.align 3

.L26:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L27:
movfr2gr.d i0, $f20
.align 3

#else
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
addi.w i0, i0, 5
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
@@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM1, VM0
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0 //i

addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
#endif
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
fcmp.ceq.s $fcc0, $f9, $f10
bceqz $fcc0, .L31
xvfcmp.clt.s VT0, VI1, VI2
xvbitsel.v VI1, VI2, VI1, VT0
b .L32
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.L31:
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VI1, VI1, VI2, VT0
xvbitsel.v x1, x1, x2, VT0
.align 3
.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L32:
fcmp.ceq.s $fcc0, $f11, $f12
bceqz $fcc0, .L33
xvfcmp.clt.s VT1, VI3, VI4
xvbitsel.v VI3, VI4, VI3, VT1
b .L34
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.L33:
xvfcmp.clt.s VT1, x3, x4
xvbitsel.v x3, x3, x4, VT1
xvbitsel.v VI3, VI3, VI4, VT1
.align 3

#ifdef DOUBLE

#else
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.L34:
fcmp.ceq.s $fcc0, $f9, $f11
bceqz $fcc0, .L35
xvfcmp.clt.s VT0, VI1, VI3
xvbitsel.v VI0, VI3, VI1, VT0
xvxor.v VM0, x1, VZE
b .L29
.align 3

.L262:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.L35:
xvfcmp.clt.s VT0, x1, x3
xvbitsel.v VM0, x1, x3, VT0
xvbitsel.v VI0, VI1, VI3, VT0
.align 3
.L272:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L29:
movfr2gr.s i0, $f20
.align 3

.L292:
xvfmaxa.s VM0, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VX0
xvbitsel.v VI0, VI0, VI1, VT0
movfr2gr.s i0, $f20
#endif

.L21: //N<8
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
@@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3

.L22:
LD $f9, X, 0
LD $f9, X, 0
#ifdef DOUBLE
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
#else
fsub.s $f10, $f3, $f9
xvfmaxa.s x1, x1, x2
xvfcmp.clt.s VT0, VM0, x1
#endif
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1
XVFMAXA VM1, x1, VM0
XVCMPEQ VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
MTG i0, $f20
.align 3

.L999:


+ 165
- 247
kernel/loongarch64/icamax_lasx.S View File

@@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
addi.d i0, i0, -3
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 0
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
li.w I, -1
xvreplgr2vr.w VI4, I
xvffint.s.w VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.w i0, i0, 1
srai.d I, N, 3
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 0
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
xvinsgr2vr.w VI0, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
xvinsgr2vr.w VI0, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
xvinsgr2vr.w VI0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
xvinsgr2vr.w VI0, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
xvinsgr2vr.w VI0, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7
#endif
.align 3

@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvadd.d VI1, VI1, VINC4
xvld VX1, X, 4 * SIZE
xvld VX1, X, 2 * SIZE
addi.d I, I, -1
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
@@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvfcmp.clt.d VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
xvfadd.d x1, x1, x2
xvfmax.d x3, VM0, x1
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
xvld VX0, X, 4 * SIZE
xvadd.d VI1, VI1, VINC4
xvld VX1, X, 6 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
#else
xvadd.w VI1, VI1, VINC8
xvld VX1, X, 8 * SIZE
xvadd.w VI1, VI1, VINC4
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VI4, x1
xvfmul.s x4, VI4, x2
xvfcmp.clt.s VT0, x1, VI3
xvfcmp.clt.s VINC4, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC4
#endif
XVFADD x1, x1, x2
XVFMAX x3, VM0, x1
XVCMPEQ VT0, x3, VM0
XVCMPLT VT0, x1, VI3
XVCMPLT VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
XVFADD x1, x1, x2
XVFMAX x3, VM0, x1
XVCMPEQ VT0, x3, VM0
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
@@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmax.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmax.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmax.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
@@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
addi.d i0, i0, -3
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 0
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
addi.w i0, i0, 1
srai.d I, N, 3
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 0
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
xvinsgr2vr.w VI0, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
xvinsgr2vr.w VI0, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
xvinsgr2vr.w VI0, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
xvinsgr2vr.w VI0, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
xvinsgr2vr.w VI0, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7
#endif
.align 3

@@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
xvadd.d VI1, VI1, VINC4
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
xvfcmp.clt.d VT0, x1, VI3
xvfcmp.clt.d VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
xvfadd.d x1, x1, x2
xvfmax.d x3, VM0, x1
ld.d t1, X, 0 * SIZE
xvfcmp.ceq.d VT0, x3, VM0
ld.d t2, X, 1 * SIZE
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
xvadd.d VI1, VI1, VINC4
addi.d I, I, -1
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
@@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
xvadd.w VI1, VI1, VINC4
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
@@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
addi.d I, I, -1
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VI4, x1
xvfmul.s x4, VI4, x2
xvfcmp.clt.s VT0, x1, VI3
@@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

#ifdef DOUBLE
.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
xvfmaxa.d VM0, x1, x2
xvfcmp.ceq.d VT0, x1, VM0
xvbitsel.v VI0, VI2, VI1, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.align 3

#ifdef DOUBLE
#else
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L262:
.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
bceqz $fcc0, .L27
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L272:
.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
bceqz $fcc0, .L28
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
bceqz $fcc0, .L29
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L292:
fcmp.clt.s $fcc0, $f15, $f13
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
.L29:
movfr2gr.s i0, $f20
.align 3

#endif
.L21: //N<8
#ifdef DOUBLE
.L21: //N<4
andi I, N, 3
bge $r0, I, .L999
srai.d i1, N, 2
slli.d i1, i1, 2
#else
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
#endif
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
@@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
MTG i0, $f20
.align 3


.L999:
move $r4, $r17
jirl $r0, $r1, 0x0


+ 123
- 1177
kernel/loongarch64/rot_lasx.S
File diff suppressed because it is too large
View File


+ 94
- 32
kernel/loongarch64/snrm2_lasx.S View File

@@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15

/* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
#define VX5 $xr22
/* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
#define RCP $f2
#define VALPHA $xr3

// The optimization for snrm2 cannot simply involve
// extending the data type from float to double and
// then summing the squares of the data. LAPACK tests
// have shown that this approach can still lead to data overflow.
// Instead, we need to find the maximum absolute value in the entire
// array and divide each data element by this maximum value before
// performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

@@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w $vr3, $vr2, 0
xvpermi.d VALPHA, $xr3,0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
srai.d I, N, 4
bne INCX, TEMP, .L20
bge $r0, I, .L997
bge $r0, I, .L997
.align 3

.L10:
xvld VX0, X, 0
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
xvld VX0, X, 0
xvld VX5, X, 8 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d X, X, 16 * SIZE

xvfmul.s VX0, VX0, VALPHA
xvfmul.s VX5, VX5, VALPHA

xvfmadd.s res1, VX0, VX0, res1
xvfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
.align 3
b .L996
.align 3

.L20:
bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f17
fadd.d $f19, $f19, $f18
xvfadd.s res1, res1, res2
xvpermi.d VX1, res1, 0x4e
xvfadd.s res1, res1, VX1
vreplvei.w $vr16, $vr19, 1
vreplvei.w $vr17, $vr19, 2
vreplvei.w $vr18, $vr19, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
.align 3

.L997:
andi I, N, 7
andi I, N, 15
bge $r0, I, .L999
.align 3

.L998:
fld.s $f15, X, 0
add.d X, X, INCX
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
addi.d I, I, -1
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3

EPILOGUE

+ 9
- 56
kernel/loongarch64/swap_lasx.S View File

@@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
move XX, X

.L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD b3, Y, 0
ST a3, Y, 0
add.d Y, Y, INCY
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
ST b1, XX, 0
add.d XX, XX, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD a2, X, 0
add.d X, X, INCX
ST b2, XX, 0
add.d XX, XX, INCX
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD a3, X, 0
add.d X, X, INCX
ST b3, XX, 0
add.d XX, XX, INCX
LD b3, Y, 0
ST a3, Y, 0
LD a4, X, 0
add.d X, X, INCX
ST b4, XX, 0
add.d XX, XX, INCX
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
ST b1, XX, 0
add.d XX, XX, INCX
ST b2, XX, 0
add.d XX, XX, INCX
ST b3, XX, 0
add.d XX, XX, INCX
ST b4, XX, 0
add.d XX, XX, INCX
addi.d I, I, -1
.rept 8
LD $f12, X, 0
LD $f14, Y, 0
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
add.d Y, Y, INCY
.endr
addi.d I, I, -1
blt $r0, I, .L222
.align 3



+ 336
- 421
kernel/power/sgemv_n.c View File

@@ -17,454 +17,369 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_n.c"

#else

#include "common.h"
#include <altivec.h>

#include "common.h"
#define NBMAX 4096

static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y,
BLASLONG lda4, FLOAT *alpha) {
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4 ;
b1 = a1 + lda4 ;
b2 = a2 + lda4 ;
b3 = a3 + lda4 ;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
x4 = xo[4] * *alpha;
x5 = xo[5] * *alpha;
x6 = xo[6] * *alpha;
x7 = xo[7] * *alpha;
__vector float* va0 = (__vector float*)a0;
__vector float* va1 = (__vector float*)a1;
__vector float* va2 = (__vector float*)a2;
__vector float* va3 = (__vector float*)a3;
__vector float* vb0 = (__vector float*)b0;
__vector float* vb1 = (__vector float*)b1;
__vector float* vb2 = (__vector float*)b2;
__vector float* vb3 = (__vector float*)b3;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float v_x4 = {x4,x4,x4,x4};
__vector float v_x5 = {x5,x5,x5,x5};
__vector float v_x6 = {x6,x6,x6,x6};
__vector float v_x7 = {x7,x7,x7,x7};
__vector float* v_y =(__vector float*)y;
for ( i=0; i< n/4; i++)
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
v_y[i] =vy;
FLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
b0 = a0 + lda4;
b1 = a1 + lda4;
b2 = a2 + lda4;
b3 = a3 + lda4;
x0 = xo[0] * (*alpha);
x1 = xo[1] * (*alpha);
x2 = xo[2] * (*alpha);
x3 = xo[3] * (*alpha);
x4 = xo[4] * (*alpha);
x5 = xo[5] * (*alpha);
x6 = xo[6] * (*alpha);
x7 = xo[7] * (*alpha);

__vector float v_x0 = {x0, x0, x0, x0};
__vector float v_x1 = {x1, x1, x1, x1};
__vector float v_x2 = {x2, x2, x2, x2};
__vector float v_x3 = {x3, x3, x3, x3};
__vector float v_x4 = {x4, x4, x4, x4};
__vector float v_x5 = {x5, x5, x5, x5};
__vector float v_x6 = {x6, x6, x6, x6};
__vector float v_x7 = {x7, x7, x7, x7};

for (i = 0; i < n; i += 4) {
__vector float vy = vec_vsx_ld(0, &y[i]);
__vector float va0 = vec_vsx_ld(0, &a0[i]);
__vector float va1 = vec_vsx_ld(0, &a1[i]);
__vector float va2 = vec_vsx_ld(0, &a2[i]);
__vector float va3 = vec_vsx_ld(0, &a3[i]);
__vector float vb0 = vec_vsx_ld(0, &b0[i]);
__vector float vb1 = vec_vsx_ld(0, &b1[i]);
__vector float vb2 = vec_vsx_ld(0, &b2[i]);
__vector float vb3 = vec_vsx_ld(0, &b3[i]);
vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3;
vy += v_x4 * vb0 + v_x5 * vb1 + v_x6 * vb2 + v_x7 * vb3;
vec_vsx_st(vy, 0, &y[i]);
}

}
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y,
FLOAT *alpha) {
BLASLONG i;
FLOAT x0,x1,x2,x3;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
x2 = xo[2] * *alpha;
x3 = xo[3] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float v_x2 = {x2,x2,x2,x2};
__vector float v_x3 = {x3,x3,x3,x3};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
__vector float* va2 = (__vector float*)ap[2];
__vector float* va3 = (__vector float*)ap[3];
for ( i=0; i< n/4; i++ )
{
register __vector float vy=v_y[i];
vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
v_y[i] =vy;
FLOAT x0, x1, x2, x3;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
x0 = xo[0] * (*alpha);
x1 = xo[1] * (*alpha);
x2 = xo[2] * (*alpha);
x3 = xo[3] * (*alpha);
__vector float v_x0 = {x0, x0, x0, x0};
__vector float v_x1 = {x1, x1, x1, x1};
__vector float v_x2 = {x2, x2, x2, x2};
__vector float v_x3 = {x3, x3, x3, x3};

for (i = 0; i < n; i += 4) {
__vector float vy = vec_vsx_ld(0, &y[i]);
__vector float va0 = vec_vsx_ld(0, &a0[i]);
__vector float va1 = vec_vsx_ld(0, &a1[i]);
__vector float va2 = vec_vsx_ld(0, &a2[i]);
__vector float va3 = vec_vsx_ld(0, &a3[i]);
vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3;
vec_vsx_st(vy, 0, &y[i]);
}
}

}

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
BLASLONG i;
FLOAT x0,x1;
x0 = x[0] * *alpha;
x1 = x[1] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float v_x1 = {x1,x1,x1,x1};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap[0];
__vector float* va1 = (__vector float*)ap[1];
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
FLOAT x0, x1;
FLOAT *a0, *a1;
a0 = ap[0];
a1 = ap[1];
x0 = x[0] * (*alpha);
x1 = x[1] * (*alpha);
__vector float v_x0 = {x0, x0, x0, x0};
__vector float v_x1 = {x1, x1, x1, x1};

for (i = 0; i < n; i += 4) {
__vector float vy = vec_vsx_ld(0, &y[i]);
__vector float va0 = vec_vsx_ld(0, &a0[i]);
__vector float va1 = vec_vsx_ld(0, &a1[i]);
vy += v_x0 * va0 + v_x1 * va1;
vec_vsx_st(vy, 0, &y[i]);
}
}

}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
BLASLONG i;
FLOAT x0 ;
x0 = x[0] * *alpha;
__vector float v_x0 = {x0,x0,x0,x0};
__vector float* v_y =(__vector float*)y;
__vector float* va0 = (__vector float*)ap;
for ( i=0; i< n/4; i++ )
{
v_y[i] += v_x0 * va0[i] ;
FLOAT x0 = x[0] * (*alpha);
__vector float v_x0 = {x0, x0, x0, x0};

for (i = 0; i < n; i += 4) {
__vector float vy = vec_vsx_ld(0, &y[i]);
__vector float va0 = vec_vsx_ld(0, &ap[i]);
vy += v_x0 * va0;
vec_vsx_st(vy, 0, &y[i]);
}

}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
BLASLONG i;
for ( i=0; i<n; i++ ){
*dest += *src;
src++;
dest += inc_dest;
for (i = 0; i < n; i++) {
*dest += *src;
src++;
dest += inc_dest;
}
return;

}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer;

if ( m < 1 ) return(0);
if ( n < 1 ) return(0);

ybuffer = buffer;
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;

}
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;


y_ptr = y;

BLASLONG NB = NBMAX;

while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;

if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;

if ( inc_x == 1 )
{


for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}


if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}

if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}


if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {
BLASLONG i, n1, m1, m2, m3, n2, lda4, lda8;
FLOAT *a_ptr, *x_ptr, *y_ptr, *ap[4];

lda4 = lda << 2;
lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__((aligned(16)));
FLOAT *ybuffer = buffer;

if (m < 1) return (0);
if (n < 1) return (0);

if (inc_x == 1) {
n1 = n >> 3;
n2 = n & 7;
} else {
n1 = n >> 2;
n2 = n & 3;
}

m3 = m & 3;
m1 = m & -4;
m2 = (m & (NBMAX - 1)) - m3;
y_ptr = y;
BLASLONG NB = NBMAX;

while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}

a_ptr = a;
x_ptr = x;

ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;

if (inc_y != 1)
memset(ybuffer, 0, NB * 4);
else
ybuffer = y_ptr;

if (inc_x == 1) {
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, ap, x_ptr, ybuffer, lda4, &alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if (n2 & 4) {
sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}

if (n2 & 2) {
sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
a_ptr += lda * 2;
x_ptr += 2;
}

if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
a_ptr += lda;
x_ptr += 1;
}

} else {
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}

for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
a_ptr += lda;
}
}

a += NB;
if (inc_y != 1) {
add_y(NB, ybuffer, y_ptr, inc_y);
y_ptr += NB * inc_y;
} else
y_ptr += NB;
}

if (m3 == 0) return (0);

if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if (lda == 3 && inc_x == 1) {
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

a_ptr += 12;
x_ptr += 4;
}

for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr++;
}

} else {
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return (0);
}

if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if (lda == 2 && inc_x == 1) {
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}

for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr++;
}

} else {
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return (0);
}

if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if (lda == 1 && inc_x == 1) {
for (i = 0; i < (n & -4); i += 4) {
temp += a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] +
a_ptr[i + 2] * x_ptr[i + 2] +
a_ptr[i + 3] * x_ptr[i + 3];
}

for (; i < n; i++) {
temp += a_ptr[i] * x_ptr[i];
}

} else {
for (i = 0; i < n; i++) {
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += 1;
}


}
else
{

for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}

for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;

}

}

a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;

}

if ( m3 == 0 ) return(0);

if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{

for( i = 0; i < ( n & -4 ); i+=4 )
{

temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

a_ptr += 12;
x_ptr += 4;
}

for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}

}
else
{

for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;


}

}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}


if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{

for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;

}


for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}

}
else
{

for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;


}

}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}

if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{

for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}

for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}

}
else
{

for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}

}
y_ptr[0] += alpha * temp;
return(0);
}


return(0);
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return (0);
}

return (0);
}

#endif


+ 128
- 172
kernel/power/sgemv_t.c View File

@@ -17,12 +17,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_t.c"
@@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define NBMAX 2048

#include <altivec.h>
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
#include <altivec.h>

static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x,
FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
register __vector float temp4 = {0,0,0,0};
register __vector float temp5 = {0,0,0,0};
register __vector float temp6 = {0,0,0,0};
register __vector float temp7 = {0,0,0,0};
register __vector float temp0 = {0, 0, 0, 0};
register __vector float temp1 = {0, 0, 0, 0};
register __vector float temp2 = {0, 0, 0, 0};
register __vector float temp3 = {0, 0, 0, 0};
register __vector float temp4 = {0, 0, 0, 0};
register __vector float temp5 = {0, 0, 0, 0};
register __vector float temp6 = {0, 0, 0, 0};
register __vector float temp7 = {0, 0, 0, 0};

a0 = ap;
a1 = ap + lda;
@@ -56,43 +56,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
va0 = (__vector float*) a0;
va1 = (__vector float*) a1;
va2 = (__vector float*) a2;
va3 = (__vector float*) a3;
va4 = (__vector float*) a4;
va5 = (__vector float*) a5;
va6 = (__vector float*) a6;
va7 = (__vector float*) a7;
v_x = (__vector float*) x;
for (i = 0; i < n/4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
temp4 += v_x[i] * va4[i];
temp5 += v_x[i] * va5[i];
temp6 += v_x[i] * va6[i];
temp7 += v_x[i] * va7[i];
}
#if defined(POWER8)
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);

y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
#else
register __vector float t0, t1, t2, t3;
register __vector float a = { alpha, alpha, alpha, alpha };
__vector float *v_y = (__vector float*) y;

for (i = 0; i < n; i += 4) {
__vector float vx = vec_vsx_ld(0, &x[i]);
__vector float vva0 = vec_vsx_ld(0, &a0[i]);
__vector float vva1 = vec_vsx_ld(0, &a1[i]);
__vector float vva2 = vec_vsx_ld(0, &a2[i]);
__vector float vva3 = vec_vsx_ld(0, &a3[i]);
__vector float vva4 = vec_vsx_ld(0, &a4[i]);
__vector float vva5 = vec_vsx_ld(0, &a5[i]);
__vector float vva6 = vec_vsx_ld(0, &a6[i]);
__vector float vva7 = vec_vsx_ld(0, &a7[i]);
temp0 += vx * vva0;
temp1 += vx * vva1;
temp2 += vx * vva2;
temp3 += vx * vva3;
temp4 += vx * vva4;
temp5 += vx * vva5;
temp6 += vx * vva6;
temp7 += vx * vva7;
}


register __vector float t0, t1, t2, t3;
register __vector float a = {alpha, alpha, alpha, alpha};
__vector float vy0 = vec_vsx_ld(0, y);
__vector float vy1 = vec_vsx_ld(0, &(y[4]));
t0 = vec_mergeh(temp0, temp2);
t1 = vec_mergel(temp0, temp2);
t2 = vec_mergeh(temp1, temp3);
@@ -113,44 +102,41 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
temp7 = vec_mergel(t1, t3);
temp4 += temp5 + temp6 + temp7;

v_y[0] += a * temp0;
v_y[1] += a * temp4;
#endif
vy0 += a * temp0;
vy1 += a * temp4;
vec_vsx_st(vy0, 0, y);
vec_vsx_st(vy1, 0, &(y[4]));

}

static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x,
FLOAT *y, FLOAT alpha) {
BLASLONG i = 0;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* va2 = (__vector float*) a2;
__vector float* va3 = (__vector float*) a3;
__vector float* v_x = (__vector float*) x;
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
register __vector float temp0 = {0, 0, 0, 0};
register __vector float temp1 = {0, 0, 0, 0};
register __vector float temp2 = {0, 0, 0, 0};
register __vector float temp3 = {0, 0, 0, 0};
for (i = 0; i < n; i += 4) {
__vector float vx = vec_vsx_ld(0, &x[i]);
__vector float vva0 = vec_vsx_ld(0, &a0[i]);
__vector float vva1 = vec_vsx_ld(0, &a1[i]);
__vector float vva2 = vec_vsx_ld(0, &a2[i]);
__vector float vva3 = vec_vsx_ld(0, &a3[i]);
temp0 += vx * vva0;
temp1 += vx * vva1;
temp2 += vx * vva2;
temp3 += vx * vva3;
}
#if defined(POWER8)
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
#else

register __vector float t0, t1, t2, t3;
register __vector float a = { alpha, alpha, alpha, alpha };
__vector float *v_y = (__vector float*) y;
register __vector float a = {alpha, alpha, alpha, alpha};
__vector float vy0 = vec_vsx_ld(0, y);

t0 = vec_mergeh(temp0, temp2);
t1 = vec_mergel(temp0, temp2);
@@ -162,47 +148,42 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
temp3 = vec_mergel(t1, t3);
temp0 += temp1 + temp2 + temp3;

v_y[0] += a * temp0;
#endif
}
vy0 += a * temp0;
vec_vsx_st(vy0, 0, y);

static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
}

static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x,
FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
BLASLONG i;
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
__vector float temp1 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
__vector float temp0 = {0, 0, 0, 0};
__vector float temp1 = {0, 0, 0, 0};
for (i = 0; i < n; i += 4) {
__vector float vx = vec_vsx_ld(0, &x[i]);
__vector float vva0 = vec_vsx_ld(0, &a0[i]);
__vector float vva1 = vec_vsx_ld(0, &a1[i]);
temp0 += vx * vva0;
temp1 += vx * vva1;
}



y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]);
}

static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
FLOAT alpha) {
BLASLONG i;
FLOAT *a0;
a0 = ap;
__vector float* va0 = (__vector float*) a0;
__vector float* v_x = (__vector float*) x;
__vector float temp0 = {0,0,0,0};
for (i = 0; i < n / 4; i ++) {
temp0 += v_x[i] * va0[i] ;
__vector float temp0 = {0, 0, 0, 0};
for (i = 0; i < n; i += 4) {
__vector float vx = vec_vsx_ld(0, &x[i]);
__vector float vva0 = vec_vsx_ld(0, &ap[i]);
temp0 += vx * vva0;
}

y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);

y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]);
}

static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
@@ -213,20 +194,14 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
}
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;

BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {

BLASLONG i, j, n1, m1, m2, m3, n2;
FLOAT *a_ptr, *x_ptr, *y_ptr;
FLOAT ybuffer[8] __attribute__((aligned(16)));
FLOAT *xbuffer;
FLOAT *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);

@@ -242,7 +217,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG NB = NBMAX;

while (NB == NBMAX) {

m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
@@ -260,20 +234,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO

BLASLONG lda8 = lda << 3;


if (inc_y == 1) {

for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}

} else {
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
@@ -285,8 +254,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);


*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
@@ -307,10 +274,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO

a_ptr += lda8;
}

}


if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
@@ -318,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);

a_ptr += lda<<2;
a_ptr += lda << 2;

*y_ptr += ybuffer[0];
y_ptr += inc_y;
@@ -334,20 +299,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;

}

if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;

}

a += NB;
x += NB * inc_x;


}

if (m3 == 0) return (0);
@@ -365,13 +326,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
y_ptr = y;

if (lda == 3 && inc_y == 1) {

for (j = 0; j < (n & -4); j += 4) {

y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
y_ptr[j + 1] +=
aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] +=
aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] +=
aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}

@@ -381,38 +343,40 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}

} else {

if (inc_y == 1) {

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;

for (j = 0; j < (n & -4); j += 4) {

y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
y_ptr[j] +=
*aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 +
*(aj + lda + 1) * xtemp1 +
*(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 +
*(aj + lda2 + 1) * xtemp1 +
*(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 +
*(aj + lda3 + 1) * xtemp1 +
*(aj + lda3 + 2) * xtemp2;
aj += lda4;
}

for (; j < n; j++) {

y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j] +=
*aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}

} else {

for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
*y_ptr +=
*aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}

}

}
return (0);
}
@@ -426,14 +390,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
y_ptr = y;

if (lda == 2 && inc_y == 1) {

for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;

}

for (; j < n; j++) {
@@ -443,22 +405,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO

} else {
if (inc_y == 1) {

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;

for (j = 0; j < (n & -4); j += 4) {

y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
y_ptr[j + 1] +=
*(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] +=
*(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] +=
*(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}

for (; j < n; j++) {

y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
@@ -470,10 +432,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
aj += lda;
}
}

}
return (0);

}

FLOAT xtemp = *x_ptr * alpha;
@@ -490,10 +450,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
y_ptr[j] += aj[j] * xtemp;
}


} else {
if (inc_y == 1) {

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
@@ -516,12 +474,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
y_ptr += inc_y;
aj += lda;
}

}
}

return (0);

}

#endif

+ 207
- 97
kernel/riscv64/gemv_n_vector.c View File

@@ -27,13 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m8)
#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f32m8)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
@@ -42,103 +44,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f64m4)
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i = 0, j = 0, k = 0;
BLASLONG ix = 0, iy = 0;

if(n < 0) return(0);
FLOAT *a_ptr = a;
FLOAT temp = 0.0;
FLOAT_V_T va0, va1, vy0, vy1;
unsigned int gvl = 0;
if(inc_y == 1){
gvl = VSETVL(m);
if(gvl <= m/2){
for(k=0,j=0; k<m/(2*gvl); k++){
a_ptr = a;
ix = 0;
vy0 = VLEV_FLOAT(&y[j], gvl);
vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
for(i = 0; i < n; i++){
temp = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);

va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl);
a_ptr += lda;
ix += inc_x;
}
VSEV_FLOAT(&y[j], vy0, gvl);
VSEV_FLOAT(&y[j+gvl], vy1, gvl);
j += gvl * 2;
}
}
//tail
for(;j < m;){
gvl = VSETVL(m-j);
a_ptr = a;
ix = 0;
vy0 = VLEV_FLOAT(&y[j], gvl);
for(i = 0; i < n; i++){
temp = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);

a_ptr += lda;
ix += inc_x;
}
VSEV_FLOAT(&y[j], vy0, gvl);
j += gvl;
BLASLONG i = 0, j = 0, k = 0;
BLASLONG ix = 0, iy = 0;

if(n < 0) return(0);
FLOAT *a_ptr = a;
FLOAT temp[4];
FLOAT_V_T va0, va1, vy0, vy1,vy0_temp, vy1_temp , temp_v ,va0_0 , va0_1 , va1_0 ,va1_1 ,va2_0 ,va2_1 ,va3_0 ,va3_1 ;
unsigned int gvl = 0;
if(inc_y == 1 && inc_x == 1){
gvl = VSETVL(m);
if(gvl <= m/2){
for(k=0,j=0; k<m/(2*gvl); k++){
a_ptr = a;
ix = 0;
vy0_temp = VLEV_FLOAT(&y[j], gvl);
vy1_temp = VLEV_FLOAT(&y[j+gvl], gvl);
vy0 = VFILL_ZERO_FLOAT(vy0 , vy0 , gvl);
vy1 = VFILL_ZERO_FLOAT(vy1 , vy1 , gvl);
int i;

int remainder = n % 4;
for(i = 0; i < remainder; i++){
temp[0] = x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl);
a_ptr += lda;
ix ++;
}
}else{
BLASLONG stride_y = inc_y * sizeof(FLOAT);
gvl = VSETVL(m);
if(gvl <= m/2){
BLASLONG inc_yv = inc_y * gvl;
for(k=0,j=0; k<m/(2*gvl); k++){
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
for(i = 0; i < n; i++){
temp = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);

va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl);
a_ptr += lda;
ix += inc_x;
}
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv * 2;
}

for(i = remainder; i < n; i += 4){
va0_0 = VLEV_FLOAT(&(a_ptr)[j], gvl);
va0_1 = VLEV_FLOAT(&(a_ptr)[j+gvl], gvl);
va1_0 = VLEV_FLOAT(&(a_ptr+lda * 1)[j], gvl);
va1_1 = VLEV_FLOAT(&(a_ptr+lda * 1)[j+gvl], gvl);
va2_0 = VLEV_FLOAT(&(a_ptr+lda * 2)[j], gvl);
va2_1 = VLEV_FLOAT(&(a_ptr+lda * 2)[j+gvl], gvl);
va3_0 = VLEV_FLOAT(&(a_ptr+lda * 3)[j], gvl);
va3_1 = VLEV_FLOAT(&(a_ptr+lda * 3)[j+gvl], gvl);

vy0 = VFMACCVF_FLOAT(vy0, x[ix], va0_0, gvl);
vy1 = VFMACCVF_FLOAT(vy1, x[ix], va0_1, gvl);

vy0 = VFMACCVF_FLOAT(vy0, x[ix+1], va1_0, gvl);
vy1 = VFMACCVF_FLOAT(vy1, x[ix+1], va1_1, gvl);

vy0 = VFMACCVF_FLOAT(vy0, x[ix+2], va2_0, gvl);
vy1 = VFMACCVF_FLOAT(vy1, x[ix+2], va2_1, gvl);

vy0 = VFMACCVF_FLOAT(vy0, x[ix+3], va3_0, gvl);
vy1 = VFMACCVF_FLOAT(vy1, x[ix+3], va3_1, gvl);
a_ptr += 4 * lda;
ix +=4;
}
//tail
for(;j < m;){
gvl = VSETVL(m-j);
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
for(i = 0; i < n; i++){
temp = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl);

a_ptr += lda;
ix += inc_x;
}
VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
j += gvl;
vy0 = VFMACCVF_FLOAT(vy0_temp, alpha, vy0, gvl);
vy1 = VFMACCVF_FLOAT(vy1_temp, alpha, vy1, gvl);
VSEV_FLOAT(&y[j], vy0, gvl);
VSEV_FLOAT(&y[j+gvl], vy1, gvl);
j += gvl * 2;
}
}
//tail
if(gvl <= m - j ){
a_ptr = a;
ix = 0;
vy0_temp = VLEV_FLOAT(&y[j], gvl);
vy0 = VFILL_ZERO_FLOAT(vy0 , vy0 , gvl);
int i;

int remainder = n % 4;
for(i = 0; i < remainder; i++){
temp[0] = x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);
a_ptr += lda;
ix ++;
}

for(i = remainder; i < n; i += 4){
va0_0 = VLEV_FLOAT(&(a_ptr)[j], gvl);
va1_0 = VLEV_FLOAT(&(a_ptr+lda * 1)[j], gvl);
va2_0 = VLEV_FLOAT(&(a_ptr+lda * 2)[j], gvl);
va3_0 = VLEV_FLOAT(&(a_ptr+lda * 3)[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, x[ix], va0_0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, x[ix+1], va1_0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, x[ix+2], va2_0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, x[ix+3], va3_0, gvl);
a_ptr += 4 * lda;
ix +=4;
}
vy0 = VFMACCVF_FLOAT(vy0_temp, alpha, vy0, gvl);
VSEV_FLOAT(&y[j], vy0, gvl);
j += gvl ;
}

for(;j < m;){
gvl = VSETVL(m-j);
a_ptr = a;
ix = 0;
vy0 = VLEV_FLOAT(&y[j], gvl);
for(i = 0; i < n; i++){
temp[0] = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

a_ptr += lda;
ix += inc_x;
}
VSEV_FLOAT(&y[j], vy0, gvl);
j += gvl;
}
}else if (inc_y == 1 && inc_x !=1) {
gvl = VSETVL(m);
if(gvl <= m/2){
for(k=0,j=0; k<m/(2*gvl); k++){
a_ptr = a;
ix = 0;
vy0 = VLEV_FLOAT(&y[j], gvl);
vy1 = VLEV_FLOAT(&y[j+gvl], gvl);
for(i = 0; i < n; i++){
temp[0] = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl);
a_ptr += lda;
ix += inc_x;
}
VSEV_FLOAT(&y[j], vy0, gvl);
VSEV_FLOAT(&y[j+gvl], vy1, gvl);
j += gvl * 2;
}
}
return(0);
}
//tail
for(;j < m;){
gvl = VSETVL(m-j);
a_ptr = a;
ix = 0;
vy0 = VLEV_FLOAT(&y[j], gvl);
for(i = 0; i < n; i++){
temp[0] = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

a_ptr += lda;
ix += inc_x;
}
VSEV_FLOAT(&y[j], vy0, gvl);
j += gvl;
}
}else{
BLASLONG stride_y = inc_y * sizeof(FLOAT);
gvl = VSETVL(m);
if(gvl <= m/2){
BLASLONG inc_yv = inc_y * gvl;
for(k=0,j=0; k<m/(2*gvl); k++){
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl);
for(i = 0; i < n; i++){
temp[0] = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl);
a_ptr += lda;
ix += inc_x;
}
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv * 2;
}
}
//tail
for(;j < m;){
gvl = VSETVL(m-j);
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
for(i = 0; i < n; i++){
temp[0] = alpha * x[ix];
va0 = VLEV_FLOAT(&a_ptr[j], gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl);

a_ptr += lda;
ix += inc_x;
}
VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
j += gvl;
}
}
return(0);
}

+ 221
- 74
kernel/riscv64/zgemv_n_vector.c View File

@@ -27,32 +27,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
#define FLOAT_V_T vfloat32m2_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
#define FLOAT_V_T vfloat64m2_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i = 0, j = 0, k = 0;
BLASLONG i = 0, j = 0, k = 0;
BLASLONG ix = 0, iy = 0;
FLOAT *a_ptr = a;
FLOAT temp_r = 0.0, temp_i = 0.0;
FLOAT_V_T va0, va1, vy0, vy1;
FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4];
FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4;
unsigned int gvl = 0;
BLASLONG stride_a = sizeof(FLOAT) * 2;
BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
@@ -60,104 +64,248 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG inc_yv = inc_y * gvl * 2;
BLASLONG inc_x2 = inc_x * 2;
BLASLONG lda2 = lda * 2;
for(k=0,j=0; k<m/gvl; k++){
vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
for (k = 0, j = 0; k < m / gvl; k++)
{
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
for(i = 0; i < n; i++){
vy0 = vy0_new;
vy1 = vy1_new;

if (k < m / gvl - 1)
{
vy0_new = VLSEV_FLOAT(&y[iy + inc_yv], stride_y, gvl);
vy1_new = VLSEV_FLOAT(&y[iy + inc_yv + 1], stride_y, gvl);
}
for (i = 0; i < n % 4; i++)
{
#if !defined(XCONJ)
temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
#else
temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
#endif

va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
#if !defined(CONJ)
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else

vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#else

#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#endif
a_ptr += lda2;
ix += inc_x2;
}

for (; i < n; i += 4)
{
#if !defined(XCONJ)

x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);

#else
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);

#endif

va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
va2 = VLSEV_FLOAT(&a_ptr[j + lda2], stride_a, gvl);
va3 = VLSEV_FLOAT(&a_ptr[j + lda2 + 1], stride_a, gvl);
va4 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2], stride_a, gvl);
va5 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2 + 1], stride_a, gvl);
va6 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3], stride_a, gvl);
va7 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3 + 1], stride_a, gvl);

#if !defined(CONJ)
#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);

#else

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);

#endif

#else

#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);

#else
vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);

vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);

#endif

#endif
a_ptr += lda2 * 4;
ix += inc_x2 * 4;
}

VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv;
}
//tail
if(j/2 < m){
gvl = VSETVL(m-j/2);
// tail
if (j / 2 < m)
{
gvl = VSETVL(m - j / 2);
a_ptr = a;
ix = 0;
vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
for(i = 0; i < n; i++){
vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
for (i = 0; i < n; i++)
{
#if !defined(XCONJ)
temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
#else
temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
#endif

va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
#if !defined(CONJ)

#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else

vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#else

#if !defined(XCONJ)
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#endif
@@ -165,9 +313,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
ix += inc_x2;
}
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
}
return(0);
return (0);
}



+ 0
- 2
kernel/setparam-ref.c View File

@@ -180,9 +180,7 @@ gotoblas_t TABLE_NAME = {
sgemm_direct_performantTS,
#endif
#ifdef ARCH_ARM64
#ifdef HAVE_SME
sgemm_directTS,
#endif
#endif

sgemm_kernelTS, sgemm_betaTS,


+ 1
- 1
kernel/x86_64/sbgemv_n_microk_cooperlake_template.c View File

@@ -231,7 +231,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);

if ((m-tag_m_32x) > 16) {
if ((m-tag_m_32x) >= 16) {
STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
} else {


+ 1
- 1
kernel/x86_64/zsum.c View File

@@ -54,7 +54,7 @@ static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)

if (n <= 0 || inc_x <= 0) return(sumf);
if (inc_x == 1) {
sumf = zsum_kernel(n, x);
sumf = zasum_kernel(n, x);
}
else {
inc_x2 = 2 * inc_x;


+ 2
- 0
kernel/zarch/ctrmm4x4V.S View File

@@ -714,6 +714,8 @@ ld %f10,136(%r15)
ld %f11,144(%r15)
ld %f12,152(%r15)
br %r14

EPILOGUE
.end




+ 2
- 0
kernel/zarch/gemm8x4V.S View File

@@ -604,6 +604,8 @@ ALIGN_2
/*end*/
lmg %r6,%r12,48(%r15)
br %r14

EPILOGUE
.end




+ 2
- 0
kernel/zarch/strmm8x4V.S View File

@@ -845,6 +845,8 @@ ALIGN_2
lmg %r6,%r12,48(%r15)
#endif
br %r14

EPILOGUE
.end




+ 2
- 0
kernel/zarch/trmm8x4V.S View File

@@ -864,6 +864,8 @@ ALIGN_2
lmg %r6,%r12,48(%r15)
#endif
br %r14

EPILOGUE
.end




+ 2
- 0
kernel/zarch/ztrmm4x4V.S View File

@@ -719,6 +719,8 @@ ld %f10,136(%r15)
ld %f11,144(%r15)
ld %f12,152(%r15)
br %r14

EPILOGUE
.end




+ 10
- 0
lapack-netlib/LAPACKE/include/lapacke_config.h View File

@@ -75,7 +75,16 @@ extern "C" {

#ifndef LAPACK_COMPLEX_CUSTOM
#if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER)
#if defined(LAPACK_COMPLEX_CPP)
#include <complex>
#define lapack_complex_float std::complex<float>
#define lapack_complex_double std::complex<double>
#define lapack_complex_float_real(z) ((z).real())
#define lapack_complex_float_imag(z) ((z).imag())
#define lapack_complex_double_real(z) ((z).real())
#define lapack_complex_double_imag(z) ((z).imag())
#define _CRT_USE_C_COMPLEX_H
#else
#include <complex.h>
#define LAPACK_COMPLEX_CUSTOM
#define lapack_complex_float _Fcomplex
@@ -84,6 +93,7 @@ extern "C" {
#define lapack_complex_float_imag(z) (cimag(z))
#define lapack_complex_double_real(z) (creal(z))
#define lapack_complex_double_imag(z) (cimag(z))
#endif
#else

#if defined(LAPACK_COMPLEX_STRUCTURE)


+ 2
- 2
lapack-netlib/SRC/cgees.c View File

@@ -710,8 +710,8 @@ or GE matrices</b> */
/* > \ingroup complexGEeigen */

/* ===================================================================== */
/* Subroutine */ void cgees_(char *jobvs, char *sort, L_fp select, integer *n,
complex *a, integer *lda, integer *sdim, complex *w, complex *vs,
/* Subroutine */ void cgees_(char *jobvs, char *sort, logical (*select)(complex*),
integer *n, complex *a, integer *lda, integer *sdim, complex *w, complex *vs,
integer *ldvs, complex *work, integer *lwork, real *rwork, logical *
bwork, integer *info)
{


+ 2
- 2
lapack-netlib/SRC/cgeesx.c View File

@@ -752,8 +752,8 @@ f"> */
/* > \ingroup complexGEeigen */

/* ===================================================================== */
/* Subroutine */ void cgeesx_(char *jobvs, char *sort, L_fp select, char *
sense, integer *n, complex *a, integer *lda, integer *sdim, complex *
/* Subroutine */ void cgeesx_(char *jobvs, char *sort, logical (*select)(complex*),
char *sense, integer *n, complex *a, integer *lda, integer *sdim, complex *
w, complex *vs, integer *ldvs, real *rconde, real *rcondv, complex *
work, integer *lwork, real *rwork, logical *bwork, integer *info)
{


+ 3
- 3
lapack-netlib/SRC/cgeev.f View File

@@ -485,12 +485,12 @@
* Undo scaling if necessary
*
50 CONTINUE
IF( SCALEA ) THEN
IF( SCALEA .AND. INFO.GT.0 ) THEN
CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
IF( INFO.GT.0 ) THEN
CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR )
END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)


+ 2
- 2
lapack-netlib/SRC/cgges.c View File

@@ -784,8 +784,8 @@ or GE matrices</b> */
/* > \ingroup complexGEeigen */

/* ===================================================================== */
/* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, complex *a, integer *lda, complex *b, integer *
/* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer *
ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl,
integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer *
lwork, real *rwork, logical *bwork, integer *info)


+ 2
- 2
lapack-netlib/SRC/cgges3.c View File

@@ -783,8 +783,8 @@ f"> */
/* > \ingroup complexGEeigen */

/* ===================================================================== */
/* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, complex *a, integer *lda, complex *b, integer *
/* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer *
ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl,
integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer *
lwork, real *rwork, logical *bwork, integer *info)


+ 2
- 2
lapack-netlib/SRC/cggesx.c View File

@@ -843,8 +843,8 @@ f"> */
/* > \ingroup complexGEeigen */

/* ===================================================================== */
/* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, char *sense, integer *n, complex *a, integer *lda, complex *b,
/* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(complex*,complex*), char *sense, integer *n, complex *a, integer *lda, complex *b,
integer *ldb, integer *sdim, complex *alpha, complex *beta, complex *
vsl, integer *ldvsl, complex *vsr, integer *ldvsr, real *rconde, real
*rcondv, complex *work, integer *lwork, real *rwork, integer *iwork,


+ 1
- 1
lapack-netlib/SRC/dgees.c View File

@@ -729,7 +729,7 @@ or GE matrices</b> */
/* > \ingroup doubleGEeigen */

/* ===================================================================== */
/* Subroutine */ void dgees_(char *jobvs, char *sort, L_fp select, integer *n,
/* Subroutine */ void dgees_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), integer *n,
doublereal *a, integer *lda, integer *sdim, doublereal *wr,
doublereal *wi, doublereal *vs, integer *ldvs, doublereal *work,
integer *lwork, logical *bwork, integer *info)


+ 1
- 1
lapack-netlib/SRC/dgeesx.c View File

@@ -793,7 +793,7 @@ f"> */
/* > \ingroup doubleGEeigen */

/* ===================================================================== */
/* Subroutine */ void dgeesx_(char *jobvs, char *sort, L_fp select, char *
/* Subroutine */ void dgeesx_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), char *
sense, integer *n, doublereal *a, integer *lda, integer *sdim,
doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs,
doublereal *rconde, doublereal *rcondv, doublereal *work, integer *


+ 3
- 3
lapack-netlib/SRC/dgeev.f View File

@@ -506,17 +506,17 @@
* Undo scaling if necessary
*
50 CONTINUE
IF( SCALEA ) THEN
IF( SCALEA .AND. INFO.GT.0) THEN
CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
IF( INFO.GT.0 ) THEN
CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N,
$ IERR )
CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N,
$ IERR )
END IF
END IF
*
WORK( 1 ) = MAXWRK


+ 2
- 2
lapack-netlib/SRC/dgges.c View File

@@ -798,8 +798,8 @@ or GE matrices</b> */
/* > \ingroup doubleGEeigen */

/* ===================================================================== */
/* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, doublereal *a, integer *lda, doublereal *b,
/* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, logical
(selctg)(doublereal*, doublereal*, doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b,
integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai,
doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr,
integer *ldvsr, doublereal *work, integer *lwork, logical *bwork,


+ 2
- 2
lapack-netlib/SRC/dgges3.c View File

@@ -796,8 +796,8 @@ f"> */
/* > \ingroup doubleGEeigen */

/* ===================================================================== */
/* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, doublereal *a, integer *lda, doublereal *b,
/* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(doublereal*,doublereal*,doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b,
integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai,
doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr,
integer *ldvsr, doublereal *work, integer *lwork, logical *bwork,


+ 2
- 2
lapack-netlib/SRC/dggesx.c View File

@@ -878,8 +878,8 @@ f"> */
/* > \endverbatim */
/* > */
/* ===================================================================== */
/* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, char *sense, integer *n, doublereal *a, integer *lda,
/* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(doublereal*,doublereal*,doublereal*), char *sense, integer *n, doublereal *a, integer *lda,
doublereal *b, integer *ldb, integer *sdim, doublereal *alphar,
doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl,
doublereal *vsr, integer *ldvsr, doublereal *rconde, doublereal *


+ 1
- 1
lapack-netlib/SRC/sgees.c View File

@@ -482,7 +482,7 @@ or GE matrices</b> */
/* > \ingroup realGEeigen */

/* ===================================================================== */
/* Subroutine */ void sgees_(char *jobvs, char *sort, L_fp select, integer *n,
/* Subroutine */ void sgees_(char *jobvs, char *sort, logical(*select)(real*,real*), integer *n,
real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs,
integer *ldvs, real *work, integer *lwork, logical *bwork, integer *
info)


+ 1
- 1
lapack-netlib/SRC/sgeesx.c View File

@@ -550,7 +550,7 @@ f"> */
/* > \ingroup realGEeigen */

/* ===================================================================== */
/* Subroutine */ void sgeesx_(char *jobvs, char *sort, L_fp select, char *
/* Subroutine */ void sgeesx_(char *jobvs, char *sort, logical(*select)(real*,real*), char *
sense, integer *n, real *a, integer *lda, integer *sdim, real *wr,
real *wi, real *vs, integer *ldvs, real *rconde, real *rcondv, real *
work, integer *lwork, integer *iwork, integer *liwork, logical *bwork,


+ 3
- 3
lapack-netlib/SRC/sgeev.f View File

@@ -504,17 +504,17 @@
* Undo scaling if necessary
*
50 CONTINUE
IF( SCALEA ) THEN
IF( SCALEA .AND. INFO.GT.0) THEN
CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
IF( INFO.GT.0 ) THEN
CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N,
$ IERR )
CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N,
$ IERR )
END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)


+ 2
- 2
lapack-netlib/SRC/sgges.c View File

@@ -555,8 +555,8 @@ or GE matrices</b> */
/* > \ingroup realGEeigen */

/* ===================================================================== */
/* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, real *a, integer *lda, real *b, integer *ldb,
/* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb,
integer *sdim, real *alphar, real *alphai, real *beta, real *vsl,
integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork,
logical *bwork, integer *info)


+ 2
- 2
lapack-netlib/SRC/sgges3.c View File

@@ -553,8 +553,8 @@ f"> */
/* > \ingroup realGEeigen */

/* ===================================================================== */
/* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, real *a, integer *lda, real *b, integer *ldb,
/* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb,
integer *sdim, real *alphar, real *alphai, real *beta, real *vsl,
integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork,
logical *bwork, integer *info)


+ 2
- 2
lapack-netlib/SRC/sggesx.c View File

@@ -635,8 +635,8 @@ f"> */
/* > \endverbatim */
/* > */
/* ===================================================================== */
/* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, char *sense, integer *n, real *a, integer *lda, real *b,
/* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(real*,real*,real*), char *sense, integer *n, real *a, integer *lda, real *b,
integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta,
real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *rconde,
real *rcondv, real *work, integer *lwork, integer *iwork, integer *


+ 2
- 2
lapack-netlib/SRC/zgees.c View File

@@ -710,8 +710,8 @@ or GE matrices</b> */
/* > \ingroup complex16GEeigen */

/* ===================================================================== */
/* Subroutine */ void zgees_(char *jobvs, char *sort, L_fp select, integer *n,
doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w,
/* Subroutine */ void zgees_(char *jobvs, char *sort, logical (*select)(doublecomplex*),
integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w,
doublecomplex *vs, integer *ldvs, doublecomplex *work, integer *lwork,
doublereal *rwork, logical *bwork, integer *info)
{


+ 2
- 2
lapack-netlib/SRC/zgeesx.c View File

@@ -751,8 +751,8 @@ f"> */
/* > \ingroup complex16GEeigen */

/* ===================================================================== */
/* Subroutine */ void zgeesx_(char *jobvs, char *sort, L_fp select, char *
sense, integer *n, doublecomplex *a, integer *lda, integer *sdim,
/* Subroutine */ void zgeesx_(char *jobvs, char *sort, logical (*select)(doublecomplex*),
char * sense, integer *n, doublecomplex *a, integer *lda, integer *sdim,
doublecomplex *w, doublecomplex *vs, integer *ldvs, doublereal *
rconde, doublereal *rcondv, doublecomplex *work, integer *lwork,
doublereal *rwork, logical *bwork, integer *info)


+ 3
- 3
lapack-netlib/SRC/zgeev.f View File

@@ -485,12 +485,12 @@
* Undo scaling if necessary
*
50 CONTINUE
IF( SCALEA ) THEN
IF( SCALEA .AND. INFO.GT.0) THEN
CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ),
$ MAX( N-INFO, 1 ), IERR )
IF( INFO.GT.0 ) THEN
CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR )
END IF
END IF
*
WORK( 1 ) = MAXWRK


+ 3
- 2
lapack-netlib/SRC/zgges.c View File

@@ -784,8 +784,9 @@ or GE matrices</b> */
/* > \ingroup complex16GEeigen */

/* ===================================================================== */
/* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b,
/* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a,
integer *lda, doublecomplex *b,
integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex *
beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer
*ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork,


+ 3
- 2
lapack-netlib/SRC/zgges3.c View File

@@ -783,8 +783,9 @@ f"> */
/* > \ingroup complex16GEeigen */

/* ===================================================================== */
/* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b,
/* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a,
integer *lda, doublecomplex *b,
integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex *
beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer
*ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork,


+ 3
- 2
lapack-netlib/SRC/zggesx.c View File

@@ -843,8 +843,9 @@ f"> */
/* > \ingroup complex16GEeigen */

/* ===================================================================== */
/* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp
selctg, char *sense, integer *n, doublecomplex *a, integer *lda,
/* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, logical
(*selctg)(doublecomplex*,doublecomplex*), char *sense, integer *n,
doublecomplex *a, integer *lda,
doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha,
doublecomplex *beta, doublecomplex *vsl, integer *ldvsl,
doublecomplex *vsr, integer *ldvsr, doublereal *rconde, doublereal *


+ 0
- 6
lapack-netlib/TESTING/EIG/CMakeLists.txt View File

@@ -107,12 +107,6 @@ set(ZDMDEIGTST zchkdmd.f90)
macro(add_eig_executable name)
add_executable(${name} ${ARGN})
target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})

if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(${name} omp pthread)
endif()

#${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
endmacro()



+ 6
- 6
lapack-netlib/TESTING/EIG/cerred.f View File

@@ -332,7 +332,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test CGESDD
@@ -367,7 +367,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test CGEJSV
@@ -433,7 +433,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test CGESVDX
@@ -492,7 +492,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test CGESVDQ
@@ -547,7 +547,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*
@@ -558,7 +558,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*


+ 6
- 6
lapack-netlib/TESTING/EIG/derred.f View File

@@ -329,7 +329,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test DGESDD
@@ -358,7 +358,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test DGEJSV
@@ -424,7 +424,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test DGESVDX
@@ -483,7 +483,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test DGESVDQ
@@ -538,7 +538,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*
@@ -549,7 +549,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*


+ 6
- 6
lapack-netlib/TESTING/EIG/serred.f View File

@@ -329,7 +329,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test SGESDD
@@ -358,7 +358,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test SGEJSV
@@ -424,7 +424,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test SGESVDX
@@ -483,7 +483,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test SGESVDQ
@@ -538,7 +538,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*
@@ -549,7 +549,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*


+ 6
- 6
lapack-netlib/TESTING/EIG/zerred.f View File

@@ -332,7 +332,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test ZGESDD
@@ -367,7 +367,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test ZGEJSV
@@ -433,7 +433,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test ZGESVDX
@@ -492,7 +492,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
*
* Test ZGESVDQ
@@ -547,7 +547,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*
@@ -558,7 +558,7 @@
WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
$ NT
ELSE
WRITE( NOUT, FMT = 9998 )
WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) )
END IF
END IF
*


+ 0
- 4
lapack-netlib/TESTING/LIN/CMakeLists.txt View File

@@ -240,10 +240,6 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
macro(add_lin_executable name)
add_executable(${name} ${ARGN})
target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(${name} omp pthread)
endif()
#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
endmacro()



Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save