| @@ -58,8 +58,8 @@ task: | |||
| - export VALID_ARCHS="i386 x86_64" | |||
| - xcrun --sdk macosx --show-sdk-path | |||
| - xcodebuild -version | |||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" | |||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" | |||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| @@ -78,8 +78,8 @@ task: | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - xcrun --sdk iphoneos --show-sdk-path | |||
| - ls -l /Applications | |||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | |||
| @@ -102,6 +102,7 @@ jobs: | |||
| mkdir build && cd build | |||
| cmake -DDYNAMIC_ARCH=1 \ | |||
| -DUSE_OPENMP=${{matrix.openmp}} \ | |||
| -DOpenMP_Fortran_LIB_NAMES=omp \ | |||
| -DINTERFACE64=${{matrix.ilp64}} \ | |||
| -DNOFORTRAN=0 \ | |||
| -DBUILD_WITHOUT_LAPACK=0 \ | |||
| @@ -31,7 +31,7 @@ jobs: | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| - name: install build deps | |||
| run: | | |||
| @@ -40,18 +40,18 @@ jobs: | |||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| with: | |||
| repository: T-head-Semi/qemu | |||
| repository: XUANTIE-RV/qemu | |||
| path: qemu | |||
| ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||
| ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 | |||
| - name: build qemu | |||
| run: | | |||
| # Force use c910v qemu-user | |||
| wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| cd qemu | |||
| patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
| make -j$(nproc) | |||
| @@ -77,6 +77,17 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in | |||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
| # Windows shared builds that rename symbols (non-empty SYMBOLPREFIX/SYMBOLSUFFIX): | |||
| # if no static library was requested, flip the cached options to a static build | |||
| # and record in DELETE_STATIC_LIBS what is meant to be removed afterwards. | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) | |||
| # Starts empty; only filled in when a temporary static build is forced below. | |||
| set (DELETE_STATIC_LIBS "") | |||
| if (NOT BUILD_STATIC_LIBS) | |||
| message (STATUS "forcing build of a temporary static library for symbol renaming") | |||
| # FORCE overwrites whatever value is already cached for these two options. | |||
| set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) | |||
| set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| # NOTE(review): the value stored here is the token list of a file(REMOVE ...) call, | |||
| # not a COMMAND clause, and the Windows post-build step in this listing expands | |||
| # REMOVE_STATIC_LIB rather than this variable -- confirm the temporary .lib is | |||
| # actually removed. | |||
| set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib)) | |||
| endif () | |||
| endif() | |||
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| set(NO_LAPACK 1) | |||
| @@ -109,10 +120,6 @@ endif() | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| if (USE_OPENMP) | |||
| find_package(OpenMP REQUIRED) | |||
| endif () | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| @@ -230,6 +237,12 @@ endif () | |||
| # add objects to the openblas lib | |||
| if(NOT NO_LAPACK) | |||
| add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | |||
| if (USE_OPENMP AND (NOT NOFORTRAN)) | |||
| # Disable OpenMP for LAPACK Fortran codes on Windows. | |||
| if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| @@ -271,30 +284,59 @@ endif() | |||
| if (USE_OPENMP) | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
| if(NOFORTRAN) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
| if(NOFORTRAN) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # It seems that this hack has not been required since macOS 11 Big Sur | |||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| # Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on | |||
| if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) | |||
| # Use response files | |||
| set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| # Always build static library first | |||
| if(BUILD_STATIC_LIBS) | |||
| set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") | |||
| else() | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") | |||
| endif() | |||
| set(CREATE_STATIC_LIBRARY_COMMAND | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") | |||
| if(BUILD_SHARED_LIBS) | |||
| add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) | |||
| set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") | |||
| endif() | |||
| if(USE_OPENMP) | |||
| get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
| else() | |||
| set(OMP_LIB "") | |||
| endif() | |||
| if(NOT NOFORTRAN) | |||
| set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| else() | |||
| set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Handle MSVC exports | |||
| @@ -379,7 +421,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| @@ -467,10 +509,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| else () | |||
| set (BZ 0) | |||
| endif() | |||
| if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| if (CMAKE_BUILD_TYPE MATCHES "Debug") | |||
| set (CRTLIB msvcrtd) | |||
| set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb) | |||
| set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| else () | |||
| set (CRTLIB msvcrt) | |||
| set (PDBOPT "") | |||
| endif() | |||
| #if (USE_PERL) | |||
| message(STATUS "adding postbuild instruction to rename syms") | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def | |||
| COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c | |||
| COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} | |||
| #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a | |||
| ${REMOVE_STATIC_LIB} VERBATIM | |||
| ) | |||
| #endif () | |||
| else () | |||
| if (NOT USE_PERL) | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| else() | |||
| @@ -481,6 +546,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| ) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (BUILD_BENCHMARKS) | |||
| #find_package(OpenMP REQUIRED) | |||
| @@ -650,4 +716,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -26,6 +26,9 @@ | |||
| * Chris Sidebottom <chris.sidebottom@arm.com> | |||
| * Optimizations and other improvements targeting AArch64 | |||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
| * Optimizations and other improvements targeting AArch64 | |||
| ## Previous Developers | |||
| * Zaheer Chothia <zaheer.chothia@gmail.com> | |||
| @@ -247,4 +250,7 @@ In chronological order: | |||
| * Ye Tao <ye.tao@arm.com> | |||
| * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 | |||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||
| * Abhishek Kumar <https://github.com/abhishek-iitmadras> | |||
| * [2025-04-22] Optimise dot kernel for NEOVERSE V1 | |||
| @@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
| fi | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @echo -n " Library Name ... $(LIBNAME)" | |||
| else | |||
| @@ -447,7 +452,7 @@ endif | |||
| @rm -f cblas.tmp cblas.tmp2 | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C relapack clean | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @@ -435,6 +435,11 @@ ifeq (x$(XCVER), x 15) | |||
| CCOMMON_OPT += -Wl,-ld_classic | |||
| FCOMMON_OPT += -Wl,-ld_classic | |||
| endif | |||
| # Xcode 16 together with gfortran: adjust the C extra libraries (CEXTRALIB). | |||
| ifeq (x$(XCVER), x 16) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| # NOTE(review): there is no whitespace between "filter-out" and its arguments, so this | |||
| # is presumably not evaluated as a call to make's filter-out function -- confirm what | |||
| # CEXTRALIB ends up containing and whether only -lto_library was meant to be dropped. | |||
| override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) | |||
| endif | |||
| endif | |||
| endif | |||
| ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | |||
| @@ -175,7 +175,7 @@ jobs: | |||
| - script: | | |||
| brew update | |||
| brew install llvm flang | |||
| make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1 | |||
| make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 | |||
| - job: OSX_OpenMP_Clang | |||
| pool: | |||
| @@ -31,17 +31,6 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # NO_AFFINITY = 1 | |||
| find_package(OpenMP REQUIRED) | |||
| if (OpenMP_FOUND) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") | |||
| endif() | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| @@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L | |||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | |||
| endif () | |||
| @@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "FUJITSU") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q32") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") | |||
| if (USE_OPENMP) | |||
| set(FEXTRALIB "${FEXTRALIB} -lstdc++") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
| set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| endif () | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") | |||
| set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "COMPAQ") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||
| if (NOT USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") | |||
| # -w=unused: Suppress warning messages about unused variables | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
| set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
| endif () | |||
| endif () | |||
| @@ -1006,15 +1006,15 @@ endif () | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -1249,6 +1249,25 @@ endif () | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define L2_ASSOCIATIVE\t32\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| @@ -1409,9 +1428,11 @@ endif () | |||
| # GetArch_2nd | |||
| # Give every precision (S, D, Q, C, Z, X) an unroll factor of 2 for any | |||
| # <precision>GEMM_UNROLL_M / <precision>GEMM_UNROLL_N that is not defined at this point. | |||
| foreach(float_char S;D;Q;C;Z;X) | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | |||
| # Name the variable in the log so the defaulted precision can be identified. | |||
| message(STATUS "setting ${float_char}GEMM_UNROLL_M=2") | |||
| set(${float_char}GEMM_UNROLL_M 2) | |||
| endif() | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | |||
| message(STATUS "setting ${float_char}GEMM_UNROLL_N=2") | |||
| set(${float_char}GEMM_UNROLL_N 2) | |||
| endif() | |||
| endforeach() | |||
| @@ -372,6 +372,20 @@ else () | |||
| endif () | |||
| endif () | |||
| # OpenMP setup: the C component is always required and -DUSE_OPENMP is added to the | |||
| # common C options; the Fortran component is required too unless NOFORTRAN is set. | |||
| if (USE_OPENMP) | |||
| find_package(OpenMP COMPONENTS C REQUIRED) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") | |||
| if (NOT NOFORTRAN) | |||
| find_package(OpenMP COMPONENTS Fortran REQUIRED) | |||
| # Avoid mixed OpenMP linkage: both imported targets must report the same | |||
| # INTERFACE_LINK_LIBRARIES, otherwise configuration stops. | |||
| get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
| get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) | |||
| # Operands are quoted so they are compared as plain strings. | |||
| if (NOT "${OMP_C_LIB}" STREQUAL "${OMP_Fortran_LIB}") | |||
| message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| if (BINARY64) | |||
| if (INTERFACE64) | |||
| # CCOMMON_OPT += -DUSE64BITINT | |||
| @@ -655,15 +669,6 @@ if (LAPACK_STRLEN) | |||
| endif() | |||
| set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | |||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | |||
| if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") | |||
| foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||
| string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||
| endforeach () | |||
| endif () | |||
| if (CMAKE_Fortran_COMPILER) | |||
| if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| # Each flag must be its own ;-separated list entry (how the list is consumed is not | |||
| # visible here -- presumably each entry is stripped from the Fortran flags; confirm). | |||
| set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2;-mskylake-avx512") | |||
| @@ -224,10 +224,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||
| #endif | |||
| #ifdef ARCH_ARM64 | |||
| #ifdef HAVE_SME | |||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
| #endif | |||
| #endif | |||
| int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| @@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| .global REALNAME ;\ | |||
| .type REALNAME, %function ;\ | |||
| REALNAME: | |||
| #define EPILOGUE | |||
| #if defined(__ELF__) && defined(__linux__) | |||
| # define GNUSTACK .section .note.GNU-stack,"",@progbits | |||
| #else | |||
| # define GNUSTACK | |||
| #endif | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| GNUSTACK | |||
| #define PROFCODE | |||
| @@ -65,3 +65,6 @@ _cpuid: | |||
| .subsections_via_symbols | |||
| #endif | |||
| #if defined(__ELF__) && defined(__linux__) | |||
| .section .note.GNU-stack,"",@progbits | |||
| #endif | |||
| @@ -374,15 +374,20 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | |||
| if (value64 > 1) { | |||
| sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); | |||
| cpuhiperf=value64; | |||
| sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| } | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
| @@ -467,6 +472,7 @@ int n=0; | |||
| printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| // Reset the size argument that this call actually passes: it reads into value/length, | |||
| // not into the value64/length64 pair. | |||
| length = sizeof(value); | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| if (cpulowperf >0) | |||
| @@ -698,12 +704,17 @@ void get_cpuconfig(void) | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| #endif | |||
| @@ -1578,6 +1578,7 @@ int get_cpuname(void){ | |||
| case 12: //family 6 exmodel 12 | |||
| switch (model) { | |||
| case 15: | |||
| case 6: // Arrow Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SAPPHIRERAPIDS; | |||
| if(support_avx2()) | |||
| @@ -2421,6 +2422,22 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 12: | |||
| switch (model) { | |||
| case 6: // Arrow Lake | |||
| if(support_amx_bf16()) | |||
| return CORE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| @@ -44,10 +44,6 @@ else() | |||
| c_${float_char}blas1.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat1 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat1 m) | |||
| endif() | |||
| @@ -73,10 +69,6 @@ else() | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat2 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat2 m) | |||
| endif() | |||
| @@ -124,20 +116,12 @@ else() | |||
| endif() | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3 m) | |||
| endif() | |||
| if (USE_GEMM3M) | |||
| if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
| target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(x${float_char}cblat3 omp pthread) | |||
| endif() | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3_3m m) | |||
| endif() | |||
| @@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| EXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| CEXTRALIB = -lgomp | |||
| EXTRALIB = -lgomp | |||
| endif | |||
| ifeq ($(F_COMPILER), IBM) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CEXTRALIB += -lgomp | |||
| EXTRALIB += -lgomp | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| EXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -440,7 +440,7 @@ static real c_b43 = (float)1.; | |||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
| static complex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
| extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); | |||
| static complex cx[8]; | |||
| extern real scnrm2test_(integer*, complex*, integer*); | |||
| static integer np1; | |||
| @@ -223,3 +223,7 @@ if (USE_THREAD) | |||
| endif () | |||
| # Object library built from the sources collected in OPENBLAS_SRC. | |||
| add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) | |||
| # When OpenMP is enabled, attach the imported OpenMP C target to this object library. | |||
| # NOTE(review): assumes OpenMP::OpenMP_C was created earlier (e.g. by find_package(OpenMP)) - confirm. | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_level2 OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -171,3 +171,7 @@ endforeach () | |||
| # | |||
| # Object library built from the sources collected in OPENBLAS_SRC. | |||
| add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) | |||
| # When OpenMP is enabled, attach the imported OpenMP C target to this object library. | |||
| # NOTE(review): assumes OpenMP::OpenMP_C was created earlier (e.g. by find_package(OpenMP)) - confirm. | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_level3 OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -88,3 +88,7 @@ endif () | |||
| #endif | |||
| # Object library combining OPENBLAS_SRC with the MEMORY, SMP_SOURCES and COMMON_SOURCES lists. | |||
| add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) | |||
| # When OpenMP is enabled, attach the imported OpenMP C target to this object library. | |||
| # NOTE(review): assumes OpenMP::OpenMP_C was created earlier (e.g. by find_package(OpenMP)) - confirm. | |||
| if (USE_OPENMP) | |||
| target_link_libraries(driver_others OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -43,6 +43,14 @@ | |||
| #include <sys/auxv.h> | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| #include <sys/sysctl.h> | |||
| /* Scratch storage for sysctlbyname() queries on Darwin. | |||
| * value64/length64 are the output buffer and its size handed to | |||
| * sysctlbyname() by the SME feature probe in this file; value/length are | |||
| * the 32-bit counterparts (no use of them is visible in this excerpt). | |||
| * They are given internal linkage so that these very generic names are not | |||
| * exported from the library and cannot collide with other translation units. */ | |||
| static int32_t value; | |||
| static size_t length=sizeof(value); | |||
| static int64_t value64; | |||
| static size_t length64=sizeof(value64); | |||
| #endif | |||
| extern gotoblas_t gotoblas_ARMV8; | |||
| #ifdef DYNAMIC_LIST | |||
| #ifdef DYN_CORTEXA53 | |||
| @@ -120,7 +128,7 @@ extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| #ifdef DYN_CORTEXA55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
| @@ -147,17 +155,17 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| extern gotoblas_t gotoblas_A64FX; | |||
| #ifndef NO_SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||
| #endif | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #define gotoblas_A64FX gotoblas_ARMV8 | |||
| #endif | |||
| #ifndef NO_SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| @@ -168,7 +176,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 18 | |||
| #define NUM_CORETYPES 19 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -207,6 +215,7 @@ static char *corename[] = { | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "a64fx", | |||
| "armv9sme", | |||
| "unknown" | |||
| }; | |||
| @@ -229,6 +238,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | |||
| if (gotoblas == &gotoblas_A64FX) return corename[17]; | |||
| if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -266,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 15: return (&gotoblas_CORTEXA55); | |||
| case 16: return (&gotoblas_ARMV8SVE); | |||
| case 17: return (&gotoblas_A64FX); | |||
| case 18: return (&gotoblas_ARMV9SME); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -277,6 +288,11 @@ static gotoblas_t *get_coretype(void) { | |||
| char coremsg[128]; | |||
| #if defined (OS_DARWIN) | |||
| //future #if !defined(NO_SME) | |||
| // if (support_sme1()) { | |||
| // return &gotoblas_ARMV9SME; | |||
| // } | |||
| // #endif | |||
| return &gotoblas_NEOVERSEN1; | |||
| #endif | |||
| @@ -439,6 +455,7 @@ static gotoblas_t *get_coretype(void) { | |||
| } | |||
| break; | |||
| case 0x61: // Apple | |||
| //future if (support_sme1()) return &gotoblas_ARMV9SME; | |||
| return &gotoblas_NEOVERSEN1; | |||
| break; | |||
| default: | |||
| @@ -446,8 +463,8 @@ static gotoblas_t *get_coretype(void) { | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #if !defined(NO_SME) && defined(HWCAP2_SME) | |||
| if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { | |||
| #if !defined(NO_SME) | |||
| if (support_sme1()) { | |||
| return &gotoblas_ARMV9SME; | |||
| } | |||
| #endif | |||
| @@ -511,6 +528,10 @@ int support_sme1(void) { | |||
| if(getauxval(AT_HWCAP2) & HWCAP2_SME){ | |||
| ret = 1; | |||
| } | |||
| #endif | |||
| #if defined(__APPLE__) | |||
| sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0); | |||
| ret = value64; | |||
| #endif | |||
| return ret; | |||
| } | |||
| @@ -21,7 +21,7 @@ | |||
| chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | |||
| chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | |||
| csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); | |||
| ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); | |||
| @blasobjsd = ( | |||
| damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | |||
| @@ -29,7 +29,7 @@ | |||
| dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | |||
| dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | |||
| dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); | |||
| idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); | |||
| @blasobjss = ( | |||
| isamax,isamin,ismax,ismin, | |||
| @@ -38,7 +38,7 @@ | |||
| smax,smin,snrm2,simatcopy,somatcopy, | |||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); | |||
| strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); | |||
| @blasobjsz = ( | |||
| izamax,izamin,, | |||
| @@ -48,28 +48,29 @@ | |||
| zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
| zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | |||
| zgeadd, dzsum, zgemmt); | |||
| zgeadd, dzsum, zgemmt,zgemmtr); | |||
| @blasobjs = (lsame, xerbla); | |||
| @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
| @cblasobjsc = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, | |||
| cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, | |||
| cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | |||
| cblas_scnrm2, cblas_scasum, | |||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy | |||
| cblas_cgemmt); | |||
| cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, | |||
| cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, | |||
| cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); | |||
| @cblasobjsd = ( | |||
| cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | |||
| cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | |||
| cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, | |||
| cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, | |||
| cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | |||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, | |||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy | |||
| cblas_dgemmt); | |||
| cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, | |||
| cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, | |||
| cblas_damax, cblas_damin, cblas_dgemm_batch); | |||
| @cblasobjss = ( | |||
| cblas_sasum, cblas_saxpy, cblas_saxpby, | |||
| @@ -78,9 +79,10 @@ | |||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
| cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | |||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
| cblas_strsv, cblas_sgeadd, | |||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy | |||
| cblas_sgemmt); | |||
| cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, | |||
| cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, | |||
| cblas_samax, cblas_samin, cblas_sgemm_batch); | |||
| @cblasobjsz = ( | |||
| cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | |||
| cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | |||
| @@ -88,13 +90,13 @@ | |||
| cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | |||
| cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | |||
| cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | |||
| cblas_zaxpby, cblas_zgeadd, | |||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy | |||
| cblas_zgemmt); | |||
| cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, | |||
| cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, | |||
| cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); | |||
| @cblasobjs = ( cblas_xerbla ); | |||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| @@ -709,6 +711,7 @@ zpotri, | |||
| # functions added for lapack-3.7.0 | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| slarfy, | |||
| ssyconvf, | |||
| strevc3, | |||
| sgelqt, | |||
| sgelqt3, | |||
| @@ -832,12 +835,82 @@ zpotri, | |||
| zungtsqr_row | |||
| ); | |||
| #functions added for lapack-3.11 | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| cgedmd, | |||
| cgedmdq | |||
| ); | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dgedmd, | |||
| dgedmdq | |||
| ); | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sgedmd, | |||
| sgedmdq | |||
| ); | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgedmd, | |||
| zgedmdq | |||
| ); | |||
| #functions added post 3.11 | |||
| @lapackobjs2c = (@lapackobjs2c, | |||
| cgelst, | |||
| cgeqp3rk, | |||
| claqp2rk, | |||
| claqp3rk, | |||
| clatrs3, | |||
| crscl, | |||
| ctrsyl3 | |||
| ); | |||
| # claqz0 | |||
| # claqz1 | |||
| # claqz2 | |||
| # claqz3 | |||
| # clatrs3 | |||
| @lapackobjs2d = (@lapackobjs2d, | |||
| dgelst, | |||
| dgeqp3rk, | |||
| dlaqp2rk, | |||
| dlaqp3rk, | |||
| dlarmm, | |||
| dlatrs3, | |||
| dtrsyl3 | |||
| ); | |||
| @lapackobjs2s = (@lapackobjs2s, | |||
| sgelst, | |||
| sgeqp3rk, | |||
| slaqp2rk, | |||
| slaqp3rk, | |||
| slarmm, | |||
| slatrs3, | |||
| strsyl3 | |||
| ); | |||
| @lapackobjs2z = (@lapackobjs2z, | |||
| zgelst, | |||
| zgeqp3rk, | |||
| zlaqp2rk, | |||
| zlaqp3rk, | |||
| zlatrs3, | |||
| zrscl, | |||
| ztrsyl3 | |||
| ); | |||
| # zlaqz0 | |||
| # zlaqz1 | |||
| # zlaqz2 | |||
| # zlaqz3 | |||
| @lapack_extendedprecision_objs = ( | |||
| zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, | |||
| dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | |||
| ); | |||
| @lapack_deprecated_objsc = ( | |||
| cgelqs, cgeqrs, | |||
| cgegs, cggsvd, | |||
| cgegv, cggsvp, | |||
| cgelsx, clahrd, | |||
| @@ -845,13 +918,16 @@ zpotri, | |||
| ctzrqf, | |||
| ); | |||
| @lapack_deprecated_objsd = ( | |||
| dgelqs, dgeqrs, | |||
| dgegs, dgeqpf, | |||
| dgegv, dggsvd, | |||
| dgelsx, dggsvp, | |||
| dlahrd, | |||
| dlatzm, dtzrqf); | |||
| @lapack_deprecated_objss = ( | |||
| @lapack_deprecated_objss = ( | |||
| sgelqs, | |||
| sgeqrs, | |||
| sgelsx, | |||
| sgegs, | |||
| sgegv, | |||
| @@ -864,6 +940,8 @@ zpotri, | |||
| ); | |||
| @lapack_deprecated_objsz = ( | |||
| zgelqs, | |||
| zgeqrs, | |||
| zgegs, | |||
| zgegv, | |||
| zgelsx, | |||
| @@ -997,6 +1075,10 @@ zpotri, | |||
| LAPACKE_cgebrd_work, | |||
| LAPACKE_cgecon, | |||
| LAPACKE_cgecon_work, | |||
| LAPACKE_cgedmd, | |||
| LAPACKE_cgedmd_work, | |||
| LAPACKE_cgedmdq, | |||
| LAPACKE_cgedmdq_work, | |||
| LAPACKE_cgeequ, | |||
| LAPACKE_cgeequ_work, | |||
| LAPACKE_cgeequb, | |||
| @@ -1584,8 +1666,15 @@ zpotri, | |||
| LAPACKE_cgetsqrhrt, | |||
| LAPACKE_cgetsqrhrt_work, | |||
| LAPACKE_cungtsqr_row, | |||
| LAPACKE_cungtsqr_row_work | |||
| LAPACKE_cungtsqr_row_work, | |||
| LAPACKE_clangb, | |||
| LAPACKE_clangb_work, | |||
| LAPACKE_ctrsyl3, | |||
| LAPACKE_ctrsyl3_work, | |||
| LAPACKE_ctz_nancheck, | |||
| LAPACKE_ctz_trans, | |||
| LAPACKE_cunhr_col, | |||
| LAPACKE_cunhr_col_work | |||
| ); | |||
| @lapackeobjsd = ( | |||
| LAPACKE_dgb_nancheck, | |||
| @@ -1656,6 +1745,10 @@ zpotri, | |||
| LAPACKE_dgebrd_work, | |||
| LAPACKE_dgecon, | |||
| LAPACKE_dgecon_work, | |||
| LAPACKE_dgedmd, | |||
| LAPACKE_dgedmd_work, | |||
| LAPACKE_dgedmdq, | |||
| LAPACKE_dgedmdq_work, | |||
| LAPACKE_dgeequ, | |||
| LAPACKE_dgeequ_work, | |||
| LAPACKE_dgeequb, | |||
| @@ -2197,7 +2290,15 @@ zpotri, | |||
| LAPACKE_dgetsqrhrt, | |||
| LAPACKE_dgetsqrhrt_work, | |||
| LAPACKE_dorgtsqr_row, | |||
| LAPACKE_dorgtsqr_row_work | |||
| LAPACKE_dorgtsqr_row_work, | |||
| LAPACKE_dlangb, | |||
| LAPACKE_dlangb_work, | |||
| LAPACKE_dorhr_col, | |||
| LAPACKE_dorhr_col_work, | |||
| LAPACKE_dtrsyl3, | |||
| LAPACKE_dtrsyl3_work, | |||
| LAPACKE_dtz_nancheck, | |||
| LAPACKE_dtz_trans, | |||
| ); | |||
| @lapackeobjss = ( | |||
| @@ -2269,6 +2370,10 @@ zpotri, | |||
| LAPACKE_sgebrd_work, | |||
| LAPACKE_sgecon, | |||
| LAPACKE_sgecon_work, | |||
| LAPACKE_sgedmd, | |||
| LAPACKE_sgedmd_work, | |||
| LAPACKE_sgedmdq, | |||
| LAPACKE_sgedmdq_work, | |||
| LAPACKE_sgeequ, | |||
| LAPACKE_sgeequ_work, | |||
| LAPACKE_sgeequb, | |||
| @@ -2802,7 +2907,15 @@ zpotri, | |||
| LAPACKE_sgetsqrhrt, | |||
| LAPACKE_sgetsqrhrt_work, | |||
| LAPACKE_sorgtsqr_row, | |||
| LAPACKE_sorgtsqr_row_work | |||
| LAPACKE_sorgtsqr_row_work, | |||
| LAPACKE_slangb, | |||
| LAPACKE_slangb_work, | |||
| LAPACKE_sorhr_col, | |||
| LAPACKE_sorhr_col_work, | |||
| LAPACKE_strsyl3, | |||
| LAPACKE_strsyl3_work, | |||
| LAPACKE_stz_nancheck, | |||
| LAPACKE_stz_trans, | |||
| ); | |||
| @lapackeobjsz = ( | |||
| @@ -2878,6 +2991,10 @@ zpotri, | |||
| LAPACKE_zgebrd_work, | |||
| LAPACKE_zgecon, | |||
| LAPACKE_zgecon_work, | |||
| LAPACKE_zgedmd, | |||
| LAPACKE_zgedmd_work, | |||
| LAPACKE_zgedmdq, | |||
| LAPACKE_zgedmdq_work, | |||
| LAPACKE_zgeequ, | |||
| LAPACKE_zgeequ_work, | |||
| LAPACKE_zgeequb, | |||
| @@ -3345,7 +3462,15 @@ zpotri, | |||
| LAPACKE_zgetsqrhrt, | |||
| LAPACKE_zgetsqrhrt_work, | |||
| LAPACKE_zungtsqr_row, | |||
| LAPACKE_zungtsqr_row_work | |||
| LAPACKE_zungtsqr_row_work, | |||
| LAPACKE_zlangb, | |||
| LAPACKE_zlangb_work, | |||
| LAPACKE_zunhr_col, | |||
| LAPACKE_zunhr_col_work, | |||
| LAPACKE_ztrsyl3, | |||
| LAPACKE_ztrsyl3_work, | |||
| LAPACKE_ztz_nancheck, | |||
| LAPACKE_ztz_trans, | |||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | |||
| @@ -3551,7 +3676,7 @@ zpotri, | |||
| LAPACKE_zsytrs_aa_2stage_work, | |||
| # new functions from 3.9.0 | |||
| LAPACKE_zgesvdq, | |||
| LAPACKE_zgesvdq_work | |||
| LAPACKE_zgesvdq_work, | |||
| ); | |||
| #These function may need 2 underscores. | |||
| @@ -3573,7 +3698,7 @@ zpotri, | |||
| ssygv_2stage, | |||
| ssysv_aa_2stage, ssytrf_aa_2stage, | |||
| ssytrs_aa_2stage, | |||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, | |||
| slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_c=( | |||
| chetf2_rook, chetrf_rook, chetri_rook, | |||
| @@ -3598,7 +3723,7 @@ zpotri, | |||
| chetrf_aa_2stage, chetrs_aa_2stage, | |||
| csysv_aa_2stage, csytrf_aa_2stage, | |||
| csytrs_aa_2stage, | |||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, | |||
| claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_d=( | |||
| dlasyf_rook, | |||
| @@ -3615,7 +3740,7 @@ zpotri, | |||
| dsbevd_2stage, dsygv_2stage, | |||
| dsysv_aa_2stage, | |||
| dsytrf_aa_2stage, dsytrs_aa_2stage, | |||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, | |||
| dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett | |||
| ); | |||
| @lapack_embeded_underscore_objs_z=( | |||
| zhetf2_rook, zhetrf_rook, zhetri_rook, | |||
| @@ -3639,7 +3764,7 @@ zpotri, | |||
| zhesv_aa_2stage, zhetrf_aa_2stage, | |||
| zhetrs_aa_2stage, zsysv_aa_2stage, | |||
| zsytrf_aa_2stage, zsytrs_aa_2stage, | |||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col | |||
| zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett | |||
| ); | |||
| @@ -30,17 +30,17 @@ set(BLAS2_SOURCES | |||
| gemv.c ger.c | |||
| trsv.c trmv.c | |||
| syr2.c gbmv.c | |||
| sbmv.c | |||
| sbmv.c spmv.c | |||
| spr2.c | |||
| tbsv.c tbmv.c | |||
| tpsv.c tpmv.c | |||
| ) | |||
| set(BLAS2_REAL_ONLY_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| symv.c syr.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_LAPACK_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| symv.c syr.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | |||
| @@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) | |||
| endforeach () | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| if (NOT NO_LAPACK) | |||
| set(LAPACK_SOURCES | |||
| lapack/gesv.c | |||
| ) | |||
| @@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| endif () | |||
| # Object library built from the sources collected in OPENBLAS_SRC. | |||
| add_library(interface OBJECT ${OPENBLAS_SRC}) | |||
| # When OpenMP is enabled, attach the imported OpenMP C target to this object library. | |||
| # NOTE(review): assumes OpenMP::OpenMP_C was created earlier (e.g. by find_package(OpenMP)) - confirm. | |||
| if (USE_OPENMP) | |||
| target_link_libraries(interface OpenMP::OpenMP_C) | |||
| endif() | |||
| @@ -70,11 +70,22 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
| static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||
| return | |||
| MN < 25600L ? 1 | |||
| : MN < 63001L ? MIN(ncpu, 4) | |||
| : MN < 459684L ? MIN(ncpu, 16) | |||
| : ncpu; | |||
| #ifdef DOUBLE | |||
| return (MN < 8100L) ? 1 | |||
| : (MN < 12100L) ? MIN(ncpu, 2) | |||
| : (MN < 36100L) ? MIN(ncpu, 4) | |||
| : (MN < 84100L) ? MIN(ncpu, 8) | |||
| : (MN < 348100L) ? MIN(ncpu, 16) | |||
| : (MN < 435600L) ? MIN(ncpu, 24) | |||
| : (MN < 810000L) ? MIN(ncpu, 32) | |||
| : (MN < 1050625L) ? MIN(ncpu, 40) | |||
| : ncpu; | |||
| #else | |||
| return (MN < 25600L) ? 1 | |||
| : (MN < 63001L) ? MIN(ncpu, 4) | |||
| : (MN < 459684L) ? MIN(ncpu, 16) | |||
| : ncpu; | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -96,11 +107,11 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||
| return num_cpu_avail(4); | |||
| return 1; | |||
| #endif | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
| } | |||
| @@ -208,7 +208,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| set(USE_DIRECT_SGEMM false) | |||
| if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME))) | |||
| if (X86_64 OR ARM64) | |||
| set(USE_DIRECT_SGEMM true) | |||
| endif() | |||
| @@ -225,9 +225,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | |||
| set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
| if (HAVE_SME) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | |||
| endif () | |||
| endif () | |||
| endif() | |||
| foreach (float_type SINGLE DOUBLE) | |||
| @@ -1364,6 +1366,9 @@ endif () | |||
| if (USE_GEMM3M) | |||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||
| endif() | |||
| # When OpenMP is enabled, attach the imported OpenMP C target to the per-core kernel target. | |||
| # NOTE(review): assumes OpenMP::OpenMP_C was created earlier (e.g. by find_package(OpenMP)) - confirm. | |||
| if (USE_OPENMP) | |||
| target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) | |||
| endif() | |||
| endfunction () | |||
| @@ -103,8 +103,8 @@ endif | |||
| ifeq ($(ARCH), arm64) | |||
| ifeq ($(TARGET_CORE), ARMV9SME) | |||
| HAVE_SME = 1 | |||
| SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||
| endif | |||
| SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -143,9 +143,10 @@ SKERNELOBJS += \ | |||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| SKERNELOBJS += \ | |||
| sgemm_direct$(TSUFFIX).$(SUFFIX) | |||
| ifdef HAVE_SME | |||
| SKERNELOBJS += \ | |||
| sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||
| sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | |||
| sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| @@ -835,9 +836,9 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| ifdef HAVE_SME | |||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifdef HAVE_SME | |||
| $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | |||
| $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | |||
| $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | |||
| @@ -1,6 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n_sve.c | |||
| SGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| @@ -74,16 +74,21 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = sgemv_n_neon.c | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| @@ -60,13 +60,13 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| @@ -1,5 +1,7 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #ifdef DYNAMIC_ARCH | |||
| extern char* gotoblas_corename(void); | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
| /* Map the vector length N of a dot product to a thread count for the | |||
| * "neoversev1" core, never exceeding ncpu. Separate threshold tables are | |||
| * used for DOUBLE and non-DOUBLE builds; each step is an inclusive upper | |||
| * bound on N, and anything above the last step uses all ncpu threads. */ | |||
| static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { | |||
| #ifdef DOUBLE | |||
|   if (N <= 64500L)   return 1; | |||
|   if (N <= 100000L)  return MIN(ncpu, 2); | |||
|   if (N <= 150000L)  return MIN(ncpu, 4); | |||
|   if (N <= 260000L)  return MIN(ncpu, 8); | |||
|   if (N <= 360000L)  return MIN(ncpu, 16); | |||
|   if (N <= 520000L)  return MIN(ncpu, 24); | |||
|   if (N <= 1010000L) return MIN(ncpu, 56); | |||
|   return ncpu; | |||
| #else | |||
|   if (N <= 110000L)  return 1; | |||
|   if (N <= 200000L)  return MIN(ncpu, 2); | |||
|   if (N <= 280000L)  return MIN(ncpu, 4); | |||
|   if (N <= 520000L)  return MIN(ncpu, 8); | |||
|   if (N <= 830000L)  return MIN(ncpu, 16); | |||
|   if (N <= 1010000L) return MIN(ncpu, 24); | |||
|   return ncpu; | |||
| #endif | |||
| } | |||
| #endif | |||
| /* Choose how many threads to use for a dot product of length n. | |||
| * Builds targeting NEOVERSEV1 (or DYNAMIC_ARCH builds that detect the | |||
| * "neoversev1" core at run time) use the tuned table for that core when | |||
| * neither COMPLEX nor BFLOAT16 is defined; every other case runs | |||
| * single-threaded up to 10000 elements and on all available CPUs above. */ | |||
| static inline int get_dot_optimal_nthreads(BLASLONG n) { | |||
|   int ncpu = num_cpu_avail(1); | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
|   return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
|   if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
|     return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
|   } | |||
| #endif | |||
|   // Default case: reuse the value queried above instead of calling | |||
|   // num_cpu_avail(1) a second time. | |||
|   if (n <= 10000L) | |||
|     return 1; | |||
|   else | |||
|     return ncpu; | |||
| } | |||
| #endif | |||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| nthreads = get_dot_optimal_nthreads(n); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| (void *)dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| @@ -0,0 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| /* Precision-dependent aliases for the SVE intrinsics used by this kernel: | |||
| * the 64-bit variants are selected when DOUBLE is defined, the 32-bit | |||
| * variants otherwise. | |||
| *   SV_COUNT - value used as the row-loop stride (sve_size) | |||
| *   SV_TYPE  - vector type holding FLOAT elements | |||
| *   SV_TRUE  - predicate used for the full-width loads and stores | |||
| *   SV_WHILE - predicate built from (i, m) for the remaining rows | |||
| *   SV_DUP   - builds a vector from a scalar such as alpha * x[j] */ | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| /* GEMV "N" kernel: for every j in [0, n) adds alpha * x[j * inc_x] times the | |||
| * m contiguous elements starting at a + j * lda (column j of a) into y. | |||
| * y is walked with stride inc_y. dummy1 and buffer are not referenced. | |||
| * Always returns 0. When inc_y == 1 the update is vectorized with SVE; any | |||
| * other inc_y takes the scalar loop at the end of the function. | |||
| */ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { /* unit-stride y: SVE path */ | |||
| BLASLONG width = (n + 3 - 1) / 3; /* ceil(n / 3): number of columns per group */ | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; /* a0/a1/a2: first column of column groups 0, 1 and 2 */ | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; /* x0/x1/x2: x entry paired with the first column of each group */ | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { /* one pass handles columns j, j + width and j + 2 * width */ | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); /* pg0k: all lanes on if column j + k * width is < n, otherwise all lanes off */ | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); /* tempk: alpha * x broadcast to all lanes; x is not read for a missing column */ | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); /* elements per SVE vector (svcntd or svcntw, see SV_COUNT) */ | |||
| while ((i + sve_size * 1 - 1) < m) { /* rows covered by one full vector */ | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); /* merging form: lanes with the predicate off keep y0_vec unchanged */ | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| i += sve_size * 1; | |||
| } | |||
| if (i < m) { /* row tail: fewer than sve_size rows remain */ | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); /* lanes whose row index is still below m */ | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); /* restrict each column predicate to the remaining rows */ | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| } | |||
| a0_ptr += lda; /* move every group on to its next column */ | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { /* inc_y != 1: scalar update, one column at a time */ | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,207 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| /* GEMV "N" kernel: for every j in [0, n) adds alpha * x[j * inc_x] times the | |||
| * m contiguous elements starting at a + j * lda (column j of a) into y. | |||
| * y is walked with stride inc_y. dummy1 and buffer are not referenced. | |||
| * Always returns 0. When inc_y == 1 the update is vectorized with SVE, | |||
| * handling four vectors of rows and three columns per inner iteration; any | |||
| * other inc_y takes the scalar loop at the end of the function. | |||
| */ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { /* unit-stride y: SVE path */ | |||
| BLASLONG width = (n + 3 - 1) / 3; /* ceil(n / 3): number of columns per group */ | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; /* a0/a1/a2: first column of column groups 0, 1 and 2 */ | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; /* x0/x1/x2: x entry paired with the first column of each group */ | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { /* one pass handles columns j, j + width and j + 2 * width */ | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); /* pgVK: predicate for row vector V (0..3) of column group K (0..2); here it only encodes whether column j + K * width is < n, the row limit is folded in for the tail below */ | |||
| svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); /* tempK: alpha * x broadcast to all lanes; x is not read for a missing column */ | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); /* elements per SVE vector (svcntd or svcntw, see SV_COUNT) */ | |||
| while ((i + sve_size * 4 - 1) < m) { /* rows covered by four full vectors */ | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); /* the last argument selects the 0th..3rd whole vector after y + i */ | |||
| SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); /* merging form: lanes with the predicate off keep the y vector unchanged */ | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); | |||
| i += sve_size * 4; | |||
| } | |||
| if (i < m) { /* row tail: fewer than 4 * sve_size rows remain */ | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); /* pgV: lanes of row vector V whose row index is still below m */ | |||
| svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||
| svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||
| svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); /* restrict each column predicate to the remaining rows of its vector */ | |||
| pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||
| pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||
| pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||
| pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||
| pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||
| pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||
| pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| svst1_vnum(pg1, y + i, 1, y1_vec); | |||
| svst1_vnum(pg2, y + i, 2, y2_vec); | |||
| svst1_vnum(pg3, y + i, 3, y3_vec); | |||
| } | |||
| a0_ptr += lda; /* move every group on to its next column */ | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { /* inc_y != 1: scalar update, one column at a time */ | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -7,7 +7,6 @@ | |||
| #include <stdlib.h> | |||
| #include <inttypes.h> | |||
| #include <math.h> | |||
| #if defined(HAVE_SME) | |||
| /* Function prototypes */ | |||
| @@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
| m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | |||
| float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | |||
| /* Prevent compiler optimization by reading from memory instead | |||
| * of reading directly from vector (z) registers. | |||
| * */ | |||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
| /* Pre-process the left matrix to make it suitable for | |||
| matrix sum of outer-product calculation | |||
| */ | |||
| @@ -52,8 +61,20 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
| /* Calculate C = A*B */ | |||
| sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | |||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
| free(A_mod); | |||
| } | |||
| #else | |||
| /* Empty definition of CNAME for the #else branch: same parameter list as the real kernel, but does nothing. NOTE(review): presumably pairs with the HAVE_SME guard above so the symbol exists in non-SME builds; confirm against the full file. */ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
| BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||
| float * __restrict R, BLASLONG strideR){} | |||
| #endif | |||
| @@ -0,0 +1,219 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_neon.h> | |||
| #include "common.h" | |||
| /* | |||
|  * Single-precision GEMV "N" kernel using NEON intrinsics. | |||
|  * | |||
|  * For every j in [0, n) adds alpha * x[j * inc_x] times the m contiguous | |||
|  * elements starting at a + j * lda (column j of a) into y, which is walked | |||
|  * with stride inc_y. dummy1 and buffer are not referenced. Always returns 0. | |||
|  * | |||
|  * When both strides are 1 the columns are consumed four at a time, then two | |||
|  * at a time, then singly; inside each column group the rows are consumed in | |||
|  * chunks of 8, 4, 2 and 1. Any other stride combination takes the scalar | |||
|  * loop at the end of the function. | |||
|  * | |||
|  * The intrinsics operate on float32 data, so FLOAT must be float here. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     BLASLONG i; | |||
|     BLASLONG ix, iy; | |||
|     BLASLONG j; | |||
|     FLOAT *a_ptr; | |||
|     FLOAT temp; | |||
|     ix = 0; | |||
|     a_ptr = a; | |||
|     if (inc_x == 1 && inc_y == 1) { | |||
|         /* Heads of the (up to) four columns currently being processed. */ | |||
|         FLOAT *a0_ptr = a + lda * 0; | |||
|         FLOAT *a1_ptr = a + lda * 1; | |||
|         FLOAT *a2_ptr = a + lda * 2; | |||
|         FLOAT *a3_ptr = a + lda * 3; | |||
|         j = 0; | |||
|         /* Four columns per pass: lanes 0..3 of x0_vec hold alpha * x[j..j+3]. */ | |||
|         while (j + 3 < n) { | |||
|             float32x4_t x0_vec = vld1q_f32(x + j); | |||
|             x0_vec = vmulq_n_f32(x0_vec, alpha); | |||
|             i = 0; | |||
|             /* Eight rows per iteration (two 4-lane vectors of y). */ | |||
|             while (i + 7 < m) { | |||
|                 float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
|                 float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
|                 float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
|                 float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
|                 float32x4_t a20_vec = vld1q_f32(a2_ptr + i); | |||
|                 float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4); | |||
|                 float32x4_t a30_vec = vld1q_f32(a3_ptr + i); | |||
|                 float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4); | |||
|                 float32x4_t y0_vec = vld1q_f32(y + i); | |||
|                 float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
|                 y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0); | |||
|                 y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1); | |||
|                 y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2); | |||
|                 y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3); | |||
|                 y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0); | |||
|                 y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1); | |||
|                 y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2); | |||
|                 y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3); | |||
|                 vst1q_f32(y + i, y0_vec); | |||
|                 vst1q_f32(y + i + 4, y1_vec); | |||
|                 i += 8; | |||
|             } | |||
|             /* Four rows per iteration. */ | |||
|             while (i + 3 < m) { | |||
|                 float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
|                 float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
|                 float32x4_t a2_vec = vld1q_f32(a2_ptr + i); | |||
|                 float32x4_t a3_vec = vld1q_f32(a3_ptr + i); | |||
|                 float32x4_t y_vec = vld1q_f32(y + i); | |||
|                 y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
|                 y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
|                 y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
|                 y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
|                 vst1q_f32(y + i, y_vec); | |||
|                 i += 4; | |||
|             } | |||
|             /* Two rows per iteration (64-bit vectors). */ | |||
|             while (i + 1 < m) { | |||
|                 float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
|                 float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
|                 float32x2_t a2_vec = vld1_f32(a2_ptr + i); | |||
|                 float32x2_t a3_vec = vld1_f32(a3_ptr + i); | |||
|                 float32x2_t y_vec = vld1_f32(y + i); | |||
|                 y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
|                 y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
|                 y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
|                 y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
|                 vst1_f32(y + i, y_vec); | |||
|                 i += 2; | |||
|             } | |||
|             /* Last row, if m is odd. Lanes are read with vgetq_lane_f32 rather | |||
|              * than the compiler-specific vector subscript extension. */ | |||
|             while (i < m) { | |||
|                 y[i] += a0_ptr[i] * vgetq_lane_f32(x0_vec, 0); | |||
|                 y[i] += a1_ptr[i] * vgetq_lane_f32(x0_vec, 1); | |||
|                 y[i] += a2_ptr[i] * vgetq_lane_f32(x0_vec, 2); | |||
|                 y[i] += a3_ptr[i] * vgetq_lane_f32(x0_vec, 3); | |||
|                 i++; | |||
|             } | |||
|             a0_ptr += lda * 4; | |||
|             a1_ptr += lda * 4; | |||
|             a2_ptr += lda * 4; | |||
|             a3_ptr += lda * 4; | |||
|             j += 4; | |||
|         } | |||
|         /* Two columns per pass: lanes 0..1 of x0_vec hold alpha * x[j..j+1]. */ | |||
|         while (j + 1 < n) { | |||
|             float32x2_t x0_vec = vld1_f32(x + j); | |||
|             x0_vec = vmul_n_f32(x0_vec, alpha); | |||
|             i = 0; | |||
|             while (i + 7 < m) { | |||
|                 float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
|                 float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
|                 float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
|                 float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
|                 float32x4_t y0_vec = vld1q_f32(y + i); | |||
|                 float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
|                 y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0); | |||
|                 y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1); | |||
|                 y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0); | |||
|                 y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1); | |||
|                 vst1q_f32(y + i, y0_vec); | |||
|                 vst1q_f32(y + i + 4, y1_vec); | |||
|                 i += 8; | |||
|             } | |||
|             while (i + 3 < m) { | |||
|                 float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
|                 float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
|                 float32x4_t y_vec = vld1q_f32(y + i); | |||
|                 y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
|                 y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
|                 vst1q_f32(y + i, y_vec); | |||
|                 i += 4; | |||
|             } | |||
|             while (i + 1 < m) { | |||
|                 float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
|                 float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
|                 float32x2_t y_vec = vld1_f32(y + i); | |||
|                 y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
|                 y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
|                 vst1_f32(y + i, y_vec); | |||
|                 i += 2; | |||
|             } | |||
|             while (i < m) { | |||
|                 y[i] += a0_ptr[i] * vget_lane_f32(x0_vec, 0); | |||
|                 y[i] += a1_ptr[i] * vget_lane_f32(x0_vec, 1); | |||
|                 i++; | |||
|             } | |||
|             a0_ptr += lda * 2; | |||
|             a1_ptr += lda * 2; | |||
|             j += 2; | |||
|         } | |||
|         /* Remaining single column, handled with scalar arithmetic. */ | |||
|         while (j < n) { | |||
|             i = 0; | |||
|             temp = alpha * x[j]; | |||
|             while (i < m) { | |||
|                 y[i] += a0_ptr[i] * temp; | |||
|                 i++; | |||
|             } | |||
|             a0_ptr += lda; | |||
|             j++; | |||
|         } | |||
|         return (0); | |||
|     } | |||
|     /* Strided x or y: scalar update, one column at a time. */ | |||
|     for (j = 0; j < n; j++) { | |||
|         temp = alpha * x[ix]; | |||
|         iy = 0; | |||
|         for (i = 0; i < m; i++) { | |||
|             y[iy] += temp * a_ptr[i]; | |||
|             iy += inc_y; | |||
|         } | |||
|         a_ptr += lda; | |||
|         ix += inc_x; | |||
|     } | |||
|     return (0); | |||
| } | |||
| @@ -0,0 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_asimd_4x4.c" | |||
| /* | |||
|  * SYMV kernel reading the part of each column at and below the diagonal. | |||
|  * | |||
|  * For every column c in [0, offset), with the column stored at a + c * lda: | |||
|  *   y[r] += alpha * x[c] * a[r, c]                      for c < r < m | |||
|  *   y[c] += alpha * x[c] * a[c, c] + alpha * sum over r > c of a[r, c] * x[r] | |||
|  * | |||
|  * Strided x / y are first packed into buffer with COPY_K (y at the start, | |||
|  * x right after it when both are strided) and y is copied back at the end. | |||
|  * Columns are taken four at a time; symv_kernel_4x4 (from the included | |||
|  * micro-kernel file) is called for the row range [c + 4, (m / 4) * 4) and | |||
|  * the remaining rows are finished with scalar code. Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
|           FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     BLASLONG row, col; | |||
|     FLOAT ax[4];    /* alpha * x for the four active columns */ | |||
|     FLOAT dot[4];   /* running column-times-x sums for the four active columns */ | |||
|     FLOAT *c0, *c1, *c2, *c3; | |||
|     FLOAT xc0, xc1, xc2, xc3; | |||
|     FLOAT *xv = x; | |||
|     FLOAT *yv = y; | |||
|     if (inc_y != 1) { | |||
|         yv = buffer; | |||
|         COPY_K(m, y, inc_y, yv, 1); | |||
|     } | |||
|     if (inc_x != 1) { | |||
|         /* Packed x goes behind packed y when y also lives in buffer. */ | |||
|         xv = (inc_y != 1) ? yv + m : buffer; | |||
|         COPY_K(m, x, inc_x, xv, 1); | |||
|     } | |||
|     const BLASLONG blocked_cols = (offset / 4) * 4; | |||
|     const BLASLONG kernel_rows = (m / 4) * 4; | |||
|     for (col = 0; col < blocked_cols; col += 4) { | |||
|         c0 = &a[col*lda]; | |||
|         c1 = c0 + lda; | |||
|         c2 = c1 + lda; | |||
|         c3 = c2 + lda; | |||
|         xc0 = xv[col]; | |||
|         xc1 = xv[col+1]; | |||
|         xc2 = xv[col+2]; | |||
|         xc3 = xv[col+3]; | |||
|         /* 4x4 diagonal block: only entries on or below the diagonal are read. */ | |||
|         dot[0] = c0[col  ]*xc0 + c0[col+1]*xc1 + c0[col+2]*xc2 + c0[col+3]*xc3; | |||
|         dot[1] = c0[col+1]*xc0 + c1[col+1]*xc1 + c1[col+2]*xc2 + c1[col+3]*xc3; | |||
|         dot[2] = c0[col+2]*xc0 + c1[col+2]*xc1 + c2[col+2]*xc2 + c2[col+3]*xc3; | |||
|         dot[3] = c0[col+3]*xc0 + c1[col+3]*xc1 + c2[col+3]*xc2 + c3[col+3]*xc3; | |||
|         ax[0] = alpha * xc0; | |||
|         ax[1] = alpha * xc1; | |||
|         ax[2] = alpha * xc2; | |||
|         ax[3] = alpha * xc3; | |||
|         if (kernel_rows > col+4) | |||
|             symv_kernel_4x4(col+4, kernel_rows, c0, c1, c2, c3, xv, yv, ax, dot); | |||
|         /* Rows past the last multiple of four. */ | |||
|         for (row = kernel_rows; row < m; row++) { | |||
|             yv[row] += ax[0] * c0[row]; | |||
|             dot[0] += c0[row] * xv[row]; | |||
|             yv[row] += ax[1] * c1[row]; | |||
|             dot[1] += c1[row] * xv[row]; | |||
|             yv[row] += ax[2] * c2[row]; | |||
|             dot[2] += c2[row] * xv[row]; | |||
|             yv[row] += ax[3] * c3[row]; | |||
|             dot[3] += c3[row] * xv[row]; | |||
|         } | |||
|         yv[col]   += alpha * dot[0]; | |||
|         yv[col+1] += alpha * dot[1]; | |||
|         yv[col+2] += alpha * dot[2]; | |||
|         yv[col+3] += alpha * dot[3]; | |||
|     } | |||
|     /* Left-over columns (offset not a multiple of four), one at a time. */ | |||
|     for (col = blocked_cols; col < offset; col++) { | |||
|         FLOAT *cj = a + col*lda; | |||
|         FLOAT ax_j = alpha * xv[col]; | |||
|         FLOAT acc = 0.0; | |||
|         yv[col] += ax_j * cj[col]; | |||
|         for (row = col+1; row < m; row++) { | |||
|             yv[row] += ax_j * cj[row]; | |||
|             acc += cj[row] * xv[row]; | |||
|         } | |||
|         yv[col] += alpha * acc; | |||
|     } | |||
|     if (inc_y != 1) { | |||
|         COPY_K(m, yv, 1, y, inc_y); | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_sve_v1x4.c" | |||
| /* | |||
|  * SYMV kernel reading the part of each column at and below the diagonal. | |||
|  * | |||
|  * For every column c in [0, offset), with the column stored at a + c * lda: | |||
|  *   y[r] += alpha * x[c] * a[r, c]                      for c < r < m | |||
|  *   y[c] += alpha * x[c] * a[c, c] + alpha * sum over r > c of a[r, c] * x[r] | |||
|  * | |||
|  * Strided x / y are first packed into buffer with COPY_K (y at the start, | |||
|  * x right after it when both are strided) and y is copied back at the end. | |||
|  * Columns are taken four at a time; symv_kernel_v1x4 (from the included | |||
|  * micro-kernel file) is called for the whole row range [c + 4, m). | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
|           FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     BLASLONG row, col; | |||
|     FLOAT ax[4];    /* alpha * x for the four active columns */ | |||
|     FLOAT dot[4];   /* running column-times-x sums for the four active columns */ | |||
|     FLOAT *c0, *c1, *c2, *c3; | |||
|     FLOAT xc0, xc1, xc2, xc3; | |||
|     FLOAT *xv = x; | |||
|     FLOAT *yv = y; | |||
|     if (inc_y != 1) { | |||
|         yv = buffer; | |||
|         COPY_K(m, y, inc_y, yv, 1); | |||
|     } | |||
|     if (inc_x != 1) { | |||
|         /* Packed x goes behind packed y when y also lives in buffer. */ | |||
|         xv = (inc_y != 1) ? yv + m : buffer; | |||
|         COPY_K(m, x, inc_x, xv, 1); | |||
|     } | |||
|     const BLASLONG blocked_cols = (offset / 4) * 4; | |||
|     for (col = 0; col < blocked_cols; col += 4) { | |||
|         c0 = &a[col*lda]; | |||
|         c1 = c0 + lda; | |||
|         c2 = c1 + lda; | |||
|         c3 = c2 + lda; | |||
|         xc0 = xv[col]; | |||
|         xc1 = xv[col+1]; | |||
|         xc2 = xv[col+2]; | |||
|         xc3 = xv[col+3]; | |||
|         /* 4x4 diagonal block: only entries on or below the diagonal are read. */ | |||
|         dot[0] = c0[col  ]*xc0 + c0[col+1]*xc1 + c0[col+2]*xc2 + c0[col+3]*xc3; | |||
|         dot[1] = c0[col+1]*xc0 + c1[col+1]*xc1 + c1[col+2]*xc2 + c1[col+3]*xc3; | |||
|         dot[2] = c0[col+2]*xc0 + c1[col+2]*xc1 + c2[col+2]*xc2 + c2[col+3]*xc3; | |||
|         dot[3] = c0[col+3]*xc0 + c1[col+3]*xc1 + c2[col+3]*xc2 + c3[col+3]*xc3; | |||
|         ax[0] = alpha * xc0; | |||
|         ax[1] = alpha * xc1; | |||
|         ax[2] = alpha * xc2; | |||
|         ax[3] = alpha * xc3; | |||
|         symv_kernel_v1x4(col+4, m, c0, c1, c2, c3, xv, yv, ax, dot); | |||
|         yv[col]   += alpha * dot[0]; | |||
|         yv[col+1] += alpha * dot[1]; | |||
|         yv[col+2] += alpha * dot[2]; | |||
|         yv[col+3] += alpha * dot[3]; | |||
|     } | |||
|     /* Left-over columns (offset not a multiple of four), one at a time. */ | |||
|     for (col = blocked_cols; col < offset; col++) { | |||
|         FLOAT ax_j = alpha * xv[col]; | |||
|         FLOAT acc = 0.0; | |||
|         c0 = &a[col*lda]; | |||
|         yv[col] += ax_j * c0[col]; | |||
|         for (row = col+1; row < m; row++) { | |||
|             yv[row] += ax_j * c0[row]; | |||
|             acc += c0[row] * xv[row]; | |||
|         } | |||
|         yv[col] += alpha * acc; | |||
|     } | |||
|     if (inc_y != 1) { | |||
|         COPY_K(m, yv, 1, y, inc_y); | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,106 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_asimd_4x4.c" | |||
| /* | |||
|  * SYMV kernel reading the part of each column at and above the diagonal. | |||
|  * | |||
|  * For every column c in [m - offset, m), with the column stored at | |||
|  * a + c * lda: | |||
|  *   y[r] += alpha * x[c] * a[r, c]                      for 0 <= r < c | |||
|  *   y[c] += alpha * x[c] * a[c, c] + alpha * sum over r < c of a[r, c] * x[r] | |||
|  * | |||
|  * Strided x / y are first packed into buffer with COPY_K (y at the start, | |||
|  * x right after it when both are strided) and y is copied back at the end. | |||
|  * Columns are taken four at a time while at least four remain, then singly. | |||
|  * Always returns 0. | |||
|  * | |||
|  * In the blocked loop symv_kernel_4x4 (from the included micro-kernel file) | |||
|  * is called for the row range [0, from), where from is the column index | |||
|  * rounded down to a multiple of four; scalar code then covers rows | |||
|  * [from, column). Starting the scalar part at `from` instead of at the | |||
|  * column index keeps rows [from, j) from being skipped when m - offset is | |||
|  * not a multiple of four. When it is a multiple of four, from == j and the | |||
|  * arithmetic is unchanged. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
|           FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     BLASLONG i, j, j1, j2, m2, from; | |||
|     FLOAT temp1, temp2; | |||
|     FLOAT tmp1[4];   /* alpha * x for the four active columns */ | |||
|     FLOAT tmp2[4];   /* running column-times-x sums for the four active columns */ | |||
|     FLOAT *a0, *a1, *a2, *a3; | |||
|     FLOAT *X = x; | |||
|     FLOAT *Y = y; | |||
|     BLASLONG m1 = m - offset;   /* first column to process */ | |||
|     if (inc_y != 1) { | |||
|         Y = buffer; | |||
|         COPY_K(m, y, inc_y, Y, 1); | |||
|     } | |||
|     if (inc_x != 1) { | |||
|         if (inc_y != 1) { | |||
|             X = Y + m; | |||
|         } else { | |||
|             X = buffer; | |||
|         } | |||
|         COPY_K(m, x, inc_x, X, 1); | |||
|     } | |||
|     /* Columns [m1, m2) form whole groups of four; [m2, m) are left over. */ | |||
|     m2 = m - (offset % 4); | |||
|     for (j = m1; j < m2; j += 4) { | |||
|         tmp1[0] = alpha * X[j]; | |||
|         tmp1[1] = alpha * X[j+1]; | |||
|         tmp1[2] = alpha * X[j+2]; | |||
|         tmp1[3] = alpha * X[j+3]; | |||
|         tmp2[0] = 0.0; | |||
|         tmp2[1] = 0.0; | |||
|         tmp2[2] = 0.0; | |||
|         tmp2[3] = 0.0; | |||
|         a0 = &a[j*lda]; | |||
|         a1 = a0 + lda; | |||
|         a2 = a1 + lda; | |||
|         a3 = a2 + lda; | |||
|         /* Rows [0, from) go through the 4x4 micro-kernel. */ | |||
|         from = (j / 4) * 4; | |||
|         if (from) | |||
|             symv_kernel_4x4(0, from, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||
|         /* Rows [from, j1) and the diagonal entry of each of the four columns. */ | |||
|         j2 = 0; | |||
|         for (j1 = j; j1 < j+4; j1++) { | |||
|             temp1 = tmp1[j2]; | |||
|             temp2 = tmp2[j2]; | |||
|             a0 = &a[j1*lda]; | |||
|             for (i = from; i < j1; i++) { | |||
|                 Y[i] += temp1 * a0[i]; | |||
|                 temp2 += a0[i] * X[i]; | |||
|             } | |||
|             Y[j1] += temp1 * a0[j1] + alpha * temp2; | |||
|             j2++; | |||
|         } | |||
|     } | |||
|     /* Left-over columns, one at a time over rows [0, j). */ | |||
|     for ( ; j < m; j++) { | |||
|         temp1 = alpha * X[j]; | |||
|         temp2 = 0.0; | |||
|         a0 = &a[j*lda]; | |||
|         for (i = 0; i < j; i++) { | |||
|             Y[i] += temp1 * a0[i]; | |||
|             temp2 += a0[i] * X[i]; | |||
|         } | |||
|         Y[j] += temp1 * a0[j] + alpha * temp2; | |||
|     } | |||
|     if (inc_y != 1) { | |||
|         COPY_K(m, Y, 1, y, inc_y); | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,104 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_sve_v1x4.c" | |||
| /* | |||
|  * SYMV driver built on the predicated SVE micro-kernel. | |||
|  * | |||
|  * Processes columns [m - offset, m), reading a[i + j*lda] for i <= j only. | |||
|  * Each column j contributes alpha * x[j] * a[i + j*lda] to y[i] (i <= j) and | |||
|  * alpha * a[i + j*lda] * x[i] (i < j) to y[j]. | |||
|  * | |||
|  * Strided x / y are first packed into `buffer` (y in the first m elements, | |||
|  * x after it, or x at the start when y already has unit stride); a packed y | |||
|  * is copied back before returning. Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
|           FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
|     FLOAT *xv = x;            /* unit-stride view of x */ | |||
|     FLOAT *yv = y;            /* unit-stride view of y */ | |||
|     FLOAT scaled_x[4];        /* alpha * x[col .. col+3] */ | |||
|     FLOAT dots[4];            /* per-column mirrored-row sums */ | |||
|     FLOAT *c0, *c1, *c2, *c3; /* the four columns of the current group */ | |||
|     FLOAT *colp; | |||
|     FLOAT xs, acc; | |||
|     BLASLONG row, col, cur, k, blocked_end; | |||
|  | |||
|     if (inc_y != 1) { | |||
|         yv = buffer; | |||
|         COPY_K(m, y, inc_y, yv, 1); | |||
|     } | |||
|     if (inc_x != 1) { | |||
|         xv = (inc_y != 1) ? yv + m : buffer; | |||
|         COPY_K(m, x, inc_x, xv, 1); | |||
|     } | |||
|  | |||
|     /* Groups of four columns up to blocked_end, single columns after it. */ | |||
|     blocked_end = m - (offset % 4); | |||
|     col = m - offset; | |||
|     while (col < blocked_end) { | |||
|         for (k = 0; k < 4; k++) { | |||
|             scaled_x[k] = alpha * xv[col + k]; | |||
|         } | |||
|         for (k = 0; k < 4; k++) { | |||
|             dots[k] = 0.0; | |||
|         } | |||
|         c0 = a + col * lda; | |||
|         c1 = c0 + lda; | |||
|         c2 = c1 + lda; | |||
|         c3 = c2 + lda; | |||
|  | |||
|         /* Rows [0, col) of all four columns go through the vector kernel. */ | |||
|         symv_kernel_v1x4(0, col, c0, c1, c2, c3, xv, yv, scaled_x, dots); | |||
|  | |||
|         /* Scalar pass over the small triangle on the diagonal. */ | |||
|         for (k = 0; k < 4; k++) { | |||
|             cur = col + k; | |||
|             xs = scaled_x[k]; | |||
|             acc = dots[k]; | |||
|             colp = a + cur * lda; | |||
|             row = col; | |||
|             while (row < cur) { | |||
|                 yv[row] += xs * colp[row]; | |||
|                 acc += colp[row] * xv[row]; | |||
|                 row++; | |||
|             } | |||
|             yv[cur] += xs * colp[cur] + alpha * acc; | |||
|         } | |||
|         col += 4; | |||
|     } | |||
|  | |||
|     /* Leftover columns, handled entirely in scalar code. */ | |||
|     while (col < m) { | |||
|         xs = alpha * xv[col]; | |||
|         acc = 0.0; | |||
|         colp = a + col * lda; | |||
|         for (row = 0; row < col; row++) { | |||
|             yv[row] += xs * colp[row]; | |||
|             acc += colp[row] * xv[row]; | |||
|         } | |||
|         yv[col] += xs * colp[col] + alpha * acc; | |||
|         col++; | |||
|     } | |||
|  | |||
|     if (inc_y != 1) { | |||
|         COPY_K(m, yv, 1, y, inc_y); | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| /* 4x4 ASIMD micro-kernel used by the SYMV driver. | |||
|  * | |||
|  * For rows i in [from, to), four rows per iteration, and the four columns | |||
|  * a0..a3 it performs | |||
|  *     y[i]     += temp1[0]*a0[i] + temp1[1]*a1[i] + temp1[2]*a2[i] + temp1[3]*a3[i] | |||
|  *     temp2[k] += sum over i of x[i] * ak[i]        (k = 0..3) | |||
|  * | |||
|  * temp1 holds the four scale factors (read only); temp2 receives the four | |||
|  * dot-product sums (accumulated into, not overwritten). | |||
|  * | |||
|  * NOTE: every iteration loads and stores elements i..i+3 and there is no | |||
|  * tail handling, so a range whose length is not a multiple of four touches | |||
|  * elements past `to`. | |||
|  */ | |||
| static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||
| FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| #ifdef DOUBLE | |||
| /* Double precision: a 128-bit vector holds two elements, so each group of */ | |||
| /* four rows is split into a low (l) and a high (h) half. */ | |||
| /* Broadcast each scale factor to both lanes. */ | |||
| float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]); | |||
| float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]); | |||
| float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]); | |||
| float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]); | |||
| /* Per-column vector accumulators for the x[i] * ak[i] sums. */ | |||
| float64x2_t vtmpy0 = {0.0, 0.0}; | |||
| float64x2_t vtmpy1 = {0.0, 0.0}; | |||
| float64x2_t vtmpy2 = {0.0, 0.0}; | |||
| float64x2_t vtmpy3 = {0.0, 0.0}; | |||
| float64x2_t vxl, vxh, vyl, vyh; | |||
| float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h; | |||
| BLASLONG i; | |||
| for (i = from; i < to; i+=4) { | |||
| /* Load four rows of y, x and of each of the four columns. */ | |||
| vyl = vld1q_f64(&y[i]); | |||
| vyh = vld1q_f64(&y[i+2]); | |||
| vxl = vld1q_f64(&x[i]); | |||
| vxh = vld1q_f64(&x[i+2]); | |||
| vap0l = vld1q_f64(&a0[i]); | |||
| vap0h = vld1q_f64(&a0[i+2]); | |||
| vap1l = vld1q_f64(&a1[i]); | |||
| vap1h = vld1q_f64(&a1[i+2]); | |||
| vap2l = vld1q_f64(&a2[i]); | |||
| vap2h = vld1q_f64(&a2[i+2]); | |||
| vap3l = vld1q_f64(&a3[i]); | |||
| vap3h = vld1q_f64(&a3[i+2]); | |||
| /* y += temp1[k] * ak, column by column (vfmaq: accumulator + product). */ | |||
| vyl = vfmaq_f64(vyl, vtmpx0, vap0l); | |||
| vyh = vfmaq_f64(vyh, vtmpx0, vap0h); | |||
| vyl = vfmaq_f64(vyl, vtmpx1, vap1l); | |||
| vyh = vfmaq_f64(vyh, vtmpx1, vap1h); | |||
| vyl = vfmaq_f64(vyl, vtmpx2, vap2l); | |||
| vyh = vfmaq_f64(vyh, vtmpx2, vap2h); | |||
| vyl = vfmaq_f64(vyl, vtmpx3, vap3l); | |||
| vyh = vfmaq_f64(vyh, vtmpx3, vap3h); | |||
| /* Accumulate x * ak into the per-column dot-product vectors. */ | |||
| vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l); | |||
| vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h); | |||
| vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l); | |||
| vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l); | |||
| vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h); | |||
| vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h); | |||
| vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l); | |||
| vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h); | |||
| vst1q_f64(&y[i], vyl); | |||
| vst1q_f64(&y[i+2], vyh); | |||
| } | |||
| /* Sum the lanes of each accumulator and add the result to temp2[k]. */ | |||
| temp2[0] += vaddvq_f64(vtmpy0); | |||
| temp2[1] += vaddvq_f64(vtmpy1); | |||
| temp2[2] += vaddvq_f64(vtmpy2); | |||
| temp2[3] += vaddvq_f64(vtmpy3); | |||
| #else | |||
| /* Single precision: one 128-bit vector holds all four rows of a group. */ | |||
| /* Broadcast each scale factor to all four lanes. */ | |||
| float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]); | |||
| float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]); | |||
| float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]); | |||
| float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]); | |||
| /* Per-column vector accumulators for the x[i] * ak[i] sums. */ | |||
| float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vx, vy; | |||
| float32x4_t vap0, vap1, vap2, vap3; | |||
| BLASLONG i; | |||
| for (i = from; i < to; i+=4) { | |||
| vy = vld1q_f32(&y[i]); | |||
| vx = vld1q_f32(&x[i]); | |||
| vap0 = vld1q_f32(&a0[i]); | |||
| vap1 = vld1q_f32(&a1[i]); | |||
| vap2 = vld1q_f32(&a2[i]); | |||
| vap3 = vld1q_f32(&a3[i]); | |||
| /* y += temp1[k] * ak, column by column. */ | |||
| vy = vfmaq_f32(vy, vtmpx0, vap0); | |||
| vy = vfmaq_f32(vy, vtmpx1, vap1); | |||
| vy = vfmaq_f32(vy, vtmpx2, vap2); | |||
| vy = vfmaq_f32(vy, vtmpx3, vap3); | |||
| /* Accumulate x * ak into the per-column dot-product vectors. */ | |||
| vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0); | |||
| vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1); | |||
| vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2); | |||
| vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3); | |||
| vst1q_f32(&y[i], vy); | |||
| } | |||
| /* Sum the lanes of each accumulator and add the result to temp2[k]. */ | |||
| temp2[0] += vaddvq_f32(vtmpy0); | |||
| temp2[1] += vaddvq_f32(vtmpy1); | |||
| temp2[2] += vaddvq_f32(vtmpy2); | |||
| temp2[3] += vaddvq_f32(vtmpy3); | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,89 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| /* Select the SVE vector type and intrinsics matching the width of FLOAT: */ | |||
| /* 64-bit element forms when DOUBLE is defined, 32-bit forms otherwise. */ | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| /* SVE micro-kernel used by the SYMV driver: one vector of rows by four columns. | |||
|  * | |||
|  * For rows i in [from, to) and the four columns a0..a3 it performs | |||
|  *     y[i]     += temp1[0]*a0[i] + temp1[1]*a1[i] + temp1[2]*a2[i] + temp1[3]*a3[i] | |||
|  *     temp2[k] += sum over i of x[i] * ak[i]        (k = 0..3) | |||
|  * | |||
|  * temp1 holds the four scale factors (read only); temp2 receives the four | |||
|  * dot-product sums (accumulated into, not overwritten). | |||
|  * | |||
|  * The row range may have any length: each iteration is governed by a | |||
|  * predicate built from (i, to), so no separate tail loop is needed. | |||
|  */ | |||
| static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||
| FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| /* Broadcast each scale factor across a full vector. */ | |||
| SV_TYPE vtmpx0 = SV_DUP(temp1[0]); | |||
| SV_TYPE vtmpx1 = SV_DUP(temp1[1]); | |||
| SV_TYPE vtmpx2 = SV_DUP(temp1[2]); | |||
| SV_TYPE vtmpx3 = SV_DUP(temp1[3]); | |||
| /* Per-column vector accumulators for the x[i] * ak[i] sums. */ | |||
| SV_TYPE vtmpy0 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy1 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy2 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy3 = SV_DUP(0.0); | |||
| SV_TYPE vx, vy; | |||
| SV_TYPE vap0, vap1, vap2, vap3; | |||
| BLASLONG i; | |||
| /* Number of FLOAT elements per SVE vector on the running hardware. */ | |||
| uint64_t sve_size = SV_COUNT(); | |||
| svbool_t pg; | |||
| for (i = from; i < to; i += sve_size) { | |||
| /* Activate only the lanes whose row index is still below `to`. */ | |||
| pg = SV_WHILE(i, to); | |||
| vy = svld1(pg, &y[i]); | |||
| vx = svld1(pg, &x[i]); | |||
| vap0 = svld1(pg, &a0[i]); | |||
| vap1 = svld1(pg, &a1[i]); | |||
| vap2 = svld1(pg, &a2[i]); | |||
| vap3 = svld1(pg, &a3[i]); | |||
| /* y += temp1[k] * ak on the active lanes. */ | |||
| vy = svmla_m(pg, vy, vtmpx0, vap0); | |||
| vy = svmla_m(pg, vy, vtmpx1, vap1); | |||
| vy = svmla_m(pg, vy, vtmpx2, vap2); | |||
| vy = svmla_m(pg, vy, vtmpx3, vap3); | |||
| /* Accumulate x * ak; the merging (_m) form leaves inactive lanes of the */ | |||
| /* accumulator unchanged, so a partial final vector adds nothing extra. */ | |||
| vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0); | |||
| vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1); | |||
| vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2); | |||
| vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3); | |||
| svst1(pg, &y[i], vy); | |||
| } | |||
| /* Reduce every lane of each accumulator and add the sum to temp2[k]. */ | |||
| pg = SV_TRUE(); | |||
| temp2[0] += svaddv(pg, vtmpy0); | |||
| temp2[1] += svaddv(pg, vtmpy1); | |||
| temp2[2] += svaddv(pg, vtmpy2); | |||
| temp2[3] += svaddv(pg, vtmpy3); | |||
| } | |||
| @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| xvxor.v VM0, VM0, VM0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| #ifdef DOUBLE | |||
| xvldrepl.d VM0, X, 0 | |||
| #else | |||
| xvldrepl.w VM0, X, 0 | |||
| #endif | |||
| XVFSUB VM0, VM0, VM0 | |||
| bne INCX, TEMP, .L20 | |||
| srai.d I, N, 4 | |||
| @@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| #else | |||
| xvfadd.s res2, res1, res2 | |||
| xvpickve.w VX1, res1, 1 | |||
| xvpickve.w VX2, res1, 2 | |||
| xvpickve.w VX3, res1, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvpickve.w VX0, res2, 4 | |||
| xvpickve.w VX1, res2, 5 | |||
| xvpickve.w VX2, res2, 6 | |||
| xvpickve.w VX3, res2, 7 | |||
| xvpickve.w VX0, res1, 4 | |||
| xvpickve.w VX1, res1, 5 | |||
| xvpickve.w VX2, res1, 6 | |||
| xvpickve.w VX3, res1, 7 | |||
| xvfadd.s res1, VX0, res1 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| #endif | |||
| .align 3 | |||
| @@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| #else | |||
| xvfadd.s res2, res1, res2 | |||
| xvpickve.w VX1, res1, 1 | |||
| xvpickve.w VX2, res1, 2 | |||
| xvpickve.w VX3, res1, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvpickve.w VX0, res2, 4 | |||
| xvpickve.w VX1, res2, 5 | |||
| xvpickve.w VX2, res2, 6 | |||
| xvpickve.w VX3, res2, 7 | |||
| xvpickve.w VX0, res1, 4 | |||
| xvpickve.w VX1, res1, 5 | |||
| xvpickve.w VX2, res1, 6 | |||
| xvpickve.w VX3, res1, 7 | |||
| xvfadd.s res1, VX0, res1 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| #endif | |||
| .align 3 | |||
| @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| addi.d Y, Y, 8 * SIZE | |||
| addi.d Y, Y, 16 * SIZE | |||
| xvpickev.w x3, VX3, VX2 | |||
| xvpickod.w x4, VX3, VX2 | |||
| xvfmadd.s res1, x1, x3, res1 | |||
| @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VX4 $xr21 | |||
| #define res1 $xr19 | |||
| #define res2 $xr20 | |||
| #define RCP $f2 | |||
| #define VALPHA $xr3 | |||
| PROLOGUE | |||
| @@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| addi.d $sp, $sp, -32 | |||
| st.d $ra, $sp, 0 | |||
| st.d N, $sp, 8 | |||
| st.d X, $sp, 16 | |||
| st.d INCX, $sp, 24 | |||
| #ifdef DYNAMIC_ARCH | |||
| bl camax_k_LA264 | |||
| #else | |||
| bl camax_k | |||
| #endif | |||
| ld.d $ra, $sp, 0 | |||
| ld.d N, $sp, 8 | |||
| ld.d X, $sp, 16 | |||
| ld.d INCX, $sp, 24 | |||
| addi.d $sp, $sp, 32 | |||
| frecip.s RCP, $f0 | |||
| vreplvei.w $vr3, $vr2, 0 | |||
| xvpermi.d VALPHA, $xr3,0x00 | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| fcmp.ceq.s $fcc0, $f0, $f19 | |||
| bcnez $fcc0, .L999 | |||
| li.d TEMP, SIZE | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| srai.d I, N, 2 | |||
| @@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 * SIZE | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvld VX0, X, 0 * SIZE | |||
| xvld VX1, X, 8 * SIZE | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmul.s VX1, VX1, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| xvfmadd.s res2, VX1, VX1, res2 | |||
| addi.d X, X, 16 * SIZE | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| b .L996 | |||
| @@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res2, VX0, VX0, res2 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| b .L996 | |||
| .L996: | |||
| xvfadd.d res1, res1, res2 | |||
| xvpickve.d VX1, res1, 1 | |||
| xvpickve.d VX2, res1, 2 | |||
| xvpickve.d VX3, res1, 3 | |||
| xvfadd.d res1, VX1, res1 | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| xvfadd.s res1, res1, res2 | |||
| xvpermi.d VX1, res1, 0x4e | |||
| xvfadd.s res1, res1, VX1 | |||
| vreplvei.w $vr17, $vr19, 1 | |||
| vreplvei.w $vr18, $vr19, 2 | |||
| vreplvei.w $vr21, $vr19, 3 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvfadd.s res1, VX4, res1 | |||
| .align 3 | |||
| .L997: | |||
| @@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fld.s a1, X, 0 * SIZE | |||
| fld.s a2, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s a2, a2 | |||
| fmadd.d res, a1, a1, res | |||
| fmadd.d res, a2, a2, res | |||
| fmul.s a1, a1, RCP | |||
| fmul.s a2, a2, RCP | |||
| fmadd.s res, a1, a1, res | |||
| fmadd.s res, a2, a2, res | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| .L999: | |||
| fsqrt.d res, res | |||
| fsqrt.s res, res | |||
| fmul.s $f0, res, $f0 | |||
| move $r4, $r17 | |||
| fcvt.s.d $f0, res | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d Y, Y, INCY | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a3, X, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a4, X, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| @@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d Y, Y, INCY | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a3, X, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a4, X, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L222 | |||
| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHAI $f1 | |||
| #define X $r7 | |||
| #define INCX $r8 | |||
| #define DUMMY2 $r9 | |||
| #define I $r12 | |||
| #define TEMP $r13 | |||
| @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| ld.d DUMMY2, $sp, 0 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| FFINT a1, a1 | |||
| @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| bne INCX, TEMP, .L22 | |||
| /////// INCX == 1 //////// | |||
| .L11: | |||
| bge $r0, I, .L997 | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L13 | |||
| b .L14 | |||
| .align 3 | |||
| bge $r0, I, .L19 | |||
| /////// INCX == 1 && N >= 4 //////// | |||
| bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. | |||
| .L13: | |||
| bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| bceqz $fcc0, .L17 | |||
| .L14: | |||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc1, .L17 | |||
| .L111: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L15: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| xvst VXZ, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvst VXZ, X, 4 * SIZE | |||
| @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L111 | |||
| b .L997 | |||
| .align 3 | |||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvld VX1, X, 4 * SIZE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| xvfmul.d x3, VXAR, x1 | |||
| xvfmul.d x4, VXAR, x2 | |||
| xvilvl.d VX2, x4 ,x3 | |||
| xvilvh.d VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 4 * SIZE | |||
| addi.d X, X, 8 * SIZE | |||
| #else | |||
| xvld VX1, X, 8 * SIZE | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VXAR, x1 | |||
| xvfmul.s x4, VXAR, x2 | |||
| xvilvl.w VX2, x4 ,x3 | |||
| xvilvh.w VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L113 | |||
| b .L997 | |||
| blt $r0, I, .L15 | |||
| b .L19 | |||
| .align 3 | |||
| .L114: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L17: | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvld VX1, X, 4 * SIZE | |||
| @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L114 | |||
| b .L997 | |||
| blt $r0, I, .L17 | |||
| b .L19 | |||
| .align 3 | |||
| /////// INCX == 1 && N < 8 /////// | |||
| .L19: | |||
| #ifdef DOUBLE | |||
| andi I, N, 3 | |||
| #else | |||
| andi I, N, 7 | |||
| #endif | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| bceqz $fcc1, .L998 | |||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| /////// INCX != 1 //////// | |||
| .L22: | |||
| bge $r0, I, .L997 | |||
| move XX, X | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L23 | |||
| b .L24 | |||
| .align 3 | |||
| .L23: | |||
| bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| move XX, X | |||
| bge $r0, I, .L29 | |||
| bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L25 | |||
| .L24: | |||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc1, .L25 | |||
| .L221: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L27: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| xvstelm.d VXZ, X, 0, 0 | |||
| xvstelm.d VXZ, X, 1 * SIZE, 0 | |||
| @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L221 | |||
| b .L997 | |||
| .align 3 | |||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d x1, t1, 0 | |||
| xvinsgr2vr.d x2, t2, 0 | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| xvinsgr2vr.d x1, t1, 2 | |||
| xvinsgr2vr.d x2, t2, 2 | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| add.d X, X, INCX | |||
| xvfmul.d x3, VXAR, x1 | |||
| xvfmul.d x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| xvstelm.d x3, XX, 0 * SIZE, 0 | |||
| xvstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 1 | |||
| xvstelm.d x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 2 | |||
| xvstelm.d x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 3 | |||
| xvstelm.d x4, XX, 1 * SIZE, 3 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 0 | |||
| xvinsgr2vr.w x2, t2, 0 | |||
| xvinsgr2vr.w x1, t3, 1 | |||
| xvinsgr2vr.w x2, t4, 1 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 2 | |||
| xvinsgr2vr.w x2, t2, 2 | |||
| xvinsgr2vr.w x1, t3, 3 | |||
| xvinsgr2vr.w x2, t4, 3 | |||
| add.d X, X, INCX | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 4 | |||
| xvinsgr2vr.w x2, t2, 4 | |||
| xvinsgr2vr.w x1, t3, 5 | |||
| xvinsgr2vr.w x2, t4, 5 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 6 | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfmul.s x3, VXAR, x1 | |||
| xvfmul.s x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| xvstelm.w x3, XX, 0 * SIZE, 0 | |||
| xvstelm.w x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 1 | |||
| xvstelm.w x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 2 | |||
| xvstelm.w x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 3 | |||
| xvstelm.w x4, XX, 1 * SIZE, 3 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 4 | |||
| xvstelm.w x4, XX, 1 * SIZE, 4 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 5 | |||
| xvstelm.w x4, XX, 1 * SIZE, 5 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 6 | |||
| xvstelm.w x4, XX, 1 * SIZE, 6 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 7 | |||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L223 | |||
| b .L997 | |||
| blt $r0, I, .L27 | |||
| b .L29 | |||
| .align 3 | |||
| .L224: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| @@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| add.d X, X, INCX | |||
| xvfmul.d VX0, VXAI, x2 | |||
| xvfmsub.d x3, VXAR, x1, VX0 | |||
| xvfmul.d VX1, VXAI, x1 | |||
| @@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfmul.s VX0, VXAI, x2 | |||
| xvfmsub.s x3, VXAR, x1, VX0 | |||
| xvfmul.s VX1, VXAI, x1 | |||
| @@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L224 | |||
| b .L997 | |||
| blt $r0, I, .L25 | |||
| b .L29 | |||
| .align 3 | |||
| .L997: | |||
| /////// INCX != 1 && N < 8 /////// | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| andi I, N, 3 | |||
| andi I, N, 3 | |||
| #else | |||
| andi I, N, 7 | |||
| andi I, N, 7 | |||
| #endif | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| bceqz $fcc1, .L998 | |||
| .L995: // alpha_r == 0.0 && alpha_i == 0.0 | |||
| ST a1, X, 0 * SIZE | |||
| ST a1, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L995 | |||
| b .L999 | |||
| .L998: | |||
| LD a1, X, 0 * SIZE | |||
| LD a2, X, 1 * SIZE | |||
| @@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ST s2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| b .L999 | |||
| .L999: | |||
| move $r4, $r12 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| EPILOGUE | |||
| @@ -53,8 +53,8 @@ PROLOGUE | |||
| #endif | |||
| /* init $f8 and $f9 to zero */ | |||
| SUB s1, s1, s1 | |||
| SUB s2, s2, s2 | |||
| xvxor.v $xr8, $xr8, $xr8 | |||
| xvxor.v $xr9, $xr9, $xr9 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| li.d TEMP, SIZE | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| @@ -64,20 +64,6 @@ PROLOGUE | |||
| /* !((inc_x == 1) && (inc_y == 1)) */ | |||
| /* init $xr8 and $xr9 to zero */ | |||
| #ifdef DOUBLE | |||
| xvldrepl.d $xr0, X, 0 | |||
| #else | |||
| xvldrepl.w $xr0, X, 0 | |||
| #endif | |||
| #ifdef DSDOT | |||
| xvfcvtl.d.s $xr0, $xr0 | |||
| xvfsub.d $xr8, $xr0, $xr0 | |||
| xvfsub.d $xr9, $xr0, $xr0 | |||
| #else | |||
| XVFSUB $xr8, $xr0, $xr0 | |||
| XVFSUB $xr9, $xr0, $xr0 | |||
| #endif | |||
| #ifdef DOUBLE | |||
| srai.d I, N, 4 | |||
| @@ -99,31 +85,31 @@ PROLOGUE | |||
| addi.w I, I, -1 | |||
| addi.d X, X, 128 | |||
| addi.d Y, Y, 128 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| xvfcvtl.d.s $xr10, $xr0 | |||
| xvfcvtl.d.s $xr11, $xr4 | |||
| xvfcvth.d.s $xr12, $xr0 | |||
| xvfcvth.d.s $xr13, $xr4 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr1 | |||
| xvfcvtl.d.s $xr11, $xr5 | |||
| xvfcvth.d.s $xr12, $xr1 | |||
| xvfcvth.d.s $xr13, $xr5 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr2 | |||
| xvfcvtl.d.s $xr11, $xr6 | |||
| xvfcvth.d.s $xr12, $xr2 | |||
| xvfcvth.d.s $xr13, $xr6 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr3 | |||
| xvfcvtl.d.s $xr11, $xr7 | |||
| xvfcvth.d.s $xr12, $xr3 | |||
| xvfcvth.d.s $xr13, $xr7 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| #else | |||
| XVFMADD $xr8, $xr0, $xr4, $xr8 | |||
| XVFMADD $xr9, $xr1, $xr5, $xr9 | |||
| @@ -149,13 +135,13 @@ PROLOGUE | |||
| addi.w I, I, -1 | |||
| addi.d X, X, 32 | |||
| addi.d Y, Y, 32 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| xvfcvtl.d.s $xr10, $xr0 | |||
| xvfcvtl.d.s $xr11, $xr4 | |||
| xvfcvth.d.s $xr12, $xr0 | |||
| xvfcvth.d.s $xr13, $xr4 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| #else | |||
| XVFMADD $xr8, $xr0, $xr4, $xr8 | |||
| #endif | |||
| @@ -163,27 +149,12 @@ PROLOGUE | |||
| .align 3 | |||
| .L14: | |||
| /* store dot in s1 $f8 */ | |||
| #ifdef DSDOT | |||
| xvfadd.d $xr8, $xr8, $xr9 | |||
| fsub.s s2, s2, s2 /* set s2 to 0.0 */ | |||
| fsub.d s2, s2, s2 /* set s2 to 0.0 */ | |||
| xvpermi.q $xr0, $xr8, 0x1 | |||
| vfadd.d $vr8, $vr8, $vr0 | |||
| vpackod.d $vr0, $vr8, $vr8 | |||
| vfadd.d $vr8, $vr8, $vr0 | |||
| #else | |||
| XVFADD $xr8, $xr8, $xr9 | |||
| SUB s2, s2, s2 /* set s2 to 0.0 */ | |||
| xvpermi.q $xr0, $xr8, 0x1 | |||
| VFADD $vr8, $vr8, $vr0 | |||
| vpackod.d $vr0, $vr8, $vr8 | |||
| #ifdef DOUBLE | |||
| VFADD $vr8, $vr8, $vr0 | |||
| #else | |||
| VFADD $vr8, $vr8, $vr0 | |||
| vpackod.w $vr0, $vr8, $vr8 | |||
| VFADD $vr8, $vr8, $vr0 | |||
| #endif /* defined DOUBLE */ | |||
| #endif /* defined DSDOT */ | |||
| .align 3 | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| @@ -197,7 +168,7 @@ PROLOGUE | |||
| /* FLOAT: 1~7 ; DOUBLE: 1~3 */ | |||
| LD a1, X, 0 | |||
| LD b1, Y, 0 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -240,7 +211,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -252,7 +223,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -264,7 +235,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -276,7 +247,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -288,7 +259,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -300,7 +271,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -312,7 +283,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -325,7 +296,7 @@ PROLOGUE | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -346,7 +317,7 @@ PROLOGUE | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -357,12 +328,13 @@ PROLOGUE | |||
| .align 3 | |||
| .L999: | |||
| #ifdef DSDOT | |||
| fadd.d $f0, s1, s2 | |||
| move $r4, $r17 | |||
| #if defined(DOUBLE) | |||
| #elif defined(DSDOT) | |||
| #else | |||
| ADD $f0, s1, s2 | |||
| fcvt.s.d $f0, $f0 | |||
| #endif | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VI3 $xr8 | |||
| #define VI4 $xr19 | |||
| #define VT0 $xr23 | |||
| #define VZE $xr3 | |||
| #define VT1 $xr4 | |||
| #define VT2 $xr5 | |||
| #define VC0 $xr6 | |||
| PROLOGUE | |||
| li.d i0, 0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| xvldi VZE, 0 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| bne INCX, TEMP, .L20 | |||
| xvld VM0, X, 0 | |||
| #ifdef DOUBLE | |||
| xvfsub.d VT1, VZE, VM0 | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| xvfmaxa.d VM0, VM0, VT1 | |||
| bge $r0, I, .L11 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| slli.d i0, i0, 1 //8 | |||
| slli.d i0, i0, 1 //4 | |||
| xvreplgr2vr.d VINC8, i0 | |||
| addi.d i0, i0, -15 | |||
| addi.d i0, i0, -7 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| @@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 5 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 //2 | |||
| xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 //3 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| xvfsub.s VT1, VZE, VM0 | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| xvfmaxa.s VM0, VM0, VT1 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| slli.w i0, i0, 1 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| @@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DOUBLE | |||
| xvld VX0, X, 0 * SIZE | |||
| xvadd.d VI1, VI1, VINC8 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvld VX1, X, 2 * SIZE | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1) | |||
| xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| xvld VX0, X, 4 * SIZE | |||
| xvadd.d VI1, VI2, VINC4 | |||
| xvld VX1, X, 6 * SIZE | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfmaxa.d VM1, VX0, VX1 | |||
| xvfcmp.ceq.d VT0, VX0, VM1 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x4, VI1, VI2, VT0 //i | |||
| xvfcmp.clt.d VC0, x1, x3 | |||
| xvbitsel.v x1, x1, x3, VC0 //abs(maxf) | |||
| xvbitsel.v x2, x2, x4, VC0 //i | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VI2, VI2, VI1, VT0 | |||
| xvfmaxa.d VM1, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI2, VI0, VT0 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| #else | |||
| xvld VX0, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvfmaxa.s VM1, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvadd.w VI2, VI1, VINC4 | |||
| xvfsub.s VT1, VZE, VX0 | |||
| xvfsub.s VT2, VZE, VX1 | |||
| xvfmaxa.s VX0, VX0, VT1 | |||
| xvfmaxa.s VX1, VX1, VT2 | |||
| xvfcmp.clt.s VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| addi.d I, I, -1 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| #endif | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f9, $f10 | |||
| bceqz $fcc0, .L16 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L17 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| b .L26 | |||
| #endif | |||
| XVFMAXA VM1, x1, x2 | |||
| XVCMPEQ VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| XVFMAXA VM0, x3, x4 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| XVFMAXA VM0, VM0, VM1 | |||
| XVCMPEQ VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| CMPEQ $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| .L16: | |||
| xvfcmp.clt.d VT0, x1, x2 | |||
| xvbitsel.v VI0, VI1, VI2, VT0 | |||
| xvbitsel.v VM0, x1, x2, VT0 | |||
| .align 3 | |||
| .L17: | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| .L11: //INCX==1 and N<8 | |||
| andi I, N, 7 | |||
| bge $r0, I, .L14 | |||
| srai.d i1, N, 3 | |||
| slli.d i1, i1, 3 | |||
| addi.d i1, i1, 1 //current index | |||
| movgr2fr.d $f21, i1 | |||
| movgr2fr.d $f20, i0 | |||
| .align 3 | |||
| .L13: | |||
| fld.d $f9, X, 0 | |||
| fsub.d $f10, $f3, $f9 | |||
| xvfmaxa.d x1, x1, x2 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| b .L26 | |||
| addi.d I, I, -1 | |||
| addi.d i1, i1, 1 | |||
| addi.d X, X, SIZE | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L13 | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| .L14: | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| .L20: // INCX!=1 | |||
| move TEMP, X | |||
| #ifdef DOUBLE | |||
| addi.d i0, i0, 1 | |||
| ld.d t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| @@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, I, .L21 | |||
| ld.d t2, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.d t3, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.d t4, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| xvinsgr2vr.d VM0, t2, 1 | |||
| xvinsgr2vr.d VM0, t3, 2 | |||
| xvinsgr2vr.d VM0, t4, 3 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvfsub.d VT1, VZE, VM0 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| slli.d i0, i0, 1 //8 | |||
| slli.d i0, i0, 1 //4 | |||
| xvreplgr2vr.d VINC8, i0 | |||
| addi.d i0, i0, -15 | |||
| addi.d i0, i0, -7 | |||
| xvfmaxa.d VM0, VM0, VT1 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 5 | |||
| addi.d i0, i0, 3 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 //2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 //3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| .align 3 | |||
| .L24: | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvadd.d VI1, VI1, VINC8 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 | |||
| xvbitsel.v x2, VI1, VI2, VT0 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvadd.d VI1, VI2, VINC4 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x3, VX0, VX1, VT0 | |||
| xvbitsel.v x4, VI1, VI2, VT0 | |||
| xvfcmp.clt.d VC0, x1, x3 | |||
| xvbitsel.v x1, x1, x3, VC0 | |||
| xvbitsel.v x2, x2, x4, VC0 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L25: | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| .align 3 | |||
| .L26: | |||
| xvfcmp.clt.d VT0, x1, x2 | |||
| xvbitsel.v VI0, VI1, VI2, VT0 | |||
| xvbitsel.v VM0, x1, x2, VT0 | |||
| .align 3 | |||
| .L27: | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| #else | |||
| .L20: // INCX!=1 | |||
| move TEMP, X | |||
| addi.w i0, i0, 1 | |||
| ld.w t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| @@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VM0, t2, 1 | |||
| xvinsgr2vr.w VM0, t3, 2 | |||
| xvinsgr2vr.w VM0, t4, 3 | |||
| ld.w t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t2, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t3, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t4, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| xvinsgr2vr.w VM0, t1, 4 | |||
| xvinsgr2vr.w VM0, t2, 5 | |||
| xvinsgr2vr.w VM0, t3, 6 | |||
| xvinsgr2vr.w VM0, t4, 7 | |||
| slli.w i0, i0, 3 //8 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| slli.w i0, i0, 1 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| @@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, 1 | |||
| addi.w i0, i0, 5 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| @@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VI0, i0, 2 //3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 //5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| #endif | |||
| .align 3 | |||
| .L24: | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvinsgr2vr.d VX0, t3, 2 | |||
| xvinsgr2vr.d VX0, t4, 3 | |||
| xvadd.d VI1, VI1, VINC8 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvinsgr2vr.d VX1, t3, 2 | |||
| xvinsgr2vr.d VX1, t4, 3 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfmaxa.d VM1, VX0, VX1 | |||
| xvfcmp.ceq.d VT0, VX0, VM1 | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VI2, VI2, VI1, VT0 | |||
| xvfmaxa.d VM1, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI2, VI0, VT0 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 * SIZE | |||
| @@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VX0, t2, 1 | |||
| xvinsgr2vr.w VX0, t3, 2 | |||
| xvinsgr2vr.w VX0, t4, 3 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 * SIZE | |||
| @@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvfmaxa.s VM1, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM1, VM0 | |||
| xvinsgr2vr.w VX1, t1, 0 | |||
| xvinsgr2vr.w VX1, t2, 1 | |||
| xvinsgr2vr.w VX1, t3, 2 | |||
| xvinsgr2vr.w VX1, t4, 3 | |||
| xvadd.w VI2, VI1, VINC4 | |||
| xvfsub.s VT1, VZE, VX0 | |||
| xvfsub.s VT2, VZE, VX1 | |||
| xvfmaxa.s VX0, VX0, VT1 | |||
| xvfmaxa.s VX1, VX1, VT2 | |||
| xvfcmp.clt.s VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| #endif | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmaxa.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.d VM0, x4, x3 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| CMPEQ $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| .align 3 | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L27 | |||
| XVCMPLT VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| fcmp.ceq.s $fcc0, $f9, $f10 | |||
| bceqz $fcc0, .L31 | |||
| xvfcmp.clt.s VT0, VI1, VI2 | |||
| xvbitsel.v VI1, VI2, VI1, VT0 | |||
| b .L32 | |||
| .align 3 | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L28 | |||
| XVCMPLT VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .L31: | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VI1, VI1, VI2, VT0 | |||
| xvbitsel.v x1, x1, x2, VT0 | |||
| .align 3 | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L29 | |||
| XVCMPLT VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .L32: | |||
| fcmp.ceq.s $fcc0, $f11, $f12 | |||
| bceqz $fcc0, .L33 | |||
| xvfcmp.clt.s VT1, VI3, VI4 | |||
| xvbitsel.v VI3, VI4, VI3, VT1 | |||
| b .L34 | |||
| .align 3 | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| movfr2gr.d i0, $f20 | |||
| #else | |||
| fmov.s $f16, $f20 | |||
| #endif | |||
| .L33: | |||
| xvfcmp.clt.s VT1, x3, x4 | |||
| xvbitsel.v x3, x3, x4, VT1 | |||
| xvbitsel.v VI3, VI3, VI4, VT1 | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| #else | |||
| .L252: | |||
| xvxor.v VI0, VI0, VI0 | |||
| xvor.v VI0, VI0, VX0 | |||
| fmov.s $f13, $f15 | |||
| xvxor.v VM0, VM0, VM0 | |||
| xvor.v VM0, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 4 | |||
| xvpickve.w VI2, VI0, 5 | |||
| xvpickve.w VI3, VI0, 6 | |||
| xvpickve.w VI4, VI0, 7 | |||
| xvpickve.w x1, VM0, 4 | |||
| xvpickve.w x2, VM0, 5 | |||
| xvpickve.w x3, VM0, 6 | |||
| xvpickve.w x4, VM0, 7 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L262 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| .L34: | |||
| fcmp.ceq.s $fcc0, $f9, $f11 | |||
| bceqz $fcc0, .L35 | |||
| xvfcmp.clt.s VT0, VI1, VI3 | |||
| xvbitsel.v VI0, VI3, VI1, VT0 | |||
| xvxor.v VM0, x1, VZE | |||
| b .L29 | |||
| .align 3 | |||
| .L262: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L272 | |||
| xvfcmp.clt.s VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| .L35: | |||
| xvfcmp.clt.s VT0, x1, x3 | |||
| xvbitsel.v VM0, x1, x3, VT0 | |||
| xvbitsel.v VI0, VI1, VI3, VT0 | |||
| .align 3 | |||
| .L272: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L282 | |||
| xvfcmp.clt.s VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L282: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L292 | |||
| xvfcmp.clt.s VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .L29: | |||
| movfr2gr.s i0, $f20 | |||
| .align 3 | |||
| .L292: | |||
| xvfmaxa.s VM0, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM0, VX0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| movfr2gr.s i0, $f20 | |||
| #endif | |||
| .L21: //N<8 | |||
| .L21: // N<8 | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 3 | |||
| @@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L22: | |||
| LD $f9, X, 0 | |||
| LD $f9, X, 0 | |||
| #ifdef DOUBLE | |||
| fsub.d $f10, $f3, $f9 | |||
| xvfmaxa.d x1, x1, x2 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| #else | |||
| fsub.s $f10, $f3, $f9 | |||
| xvfmaxa.s x1, x1, x2 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| #endif | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| addi.d I, I, -1 | |||
| XVFMAXA VM1, x1, VM0 | |||
| XVCMPEQ VT0, VM0, VM1 | |||
| add.d X, X, INCX | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| addi.d i1, i1, 1 | |||
| add.d X, X, INCX | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L22 | |||
| MTG i0, $f20 | |||
| MTG i0, $f20 | |||
| .align 3 | |||
| .L999: | |||
| @@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| addi.d i0, i0, -7 | |||
| addi.d i0, i0, -3 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, -1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 1 //3 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, -1 | |||
| xvinsgr2vr.d VI0, i0, 2 //2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 0 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| li.w I, -1 | |||
| xvreplgr2vr.w VI4, I | |||
| xvffint.s.w VI4, VI4 // -1 | |||
| bne INCX, TEMP, .L20 | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| addi.w i0, i0, -7 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 1 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, -3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 0 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| xvinsgr2vr.w VI0, i0, 1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 2 //5 | |||
| xvinsgr2vr.w VI0, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //6 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 4 //3 | |||
| xvinsgr2vr.w VI0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //4 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| xvinsgr2vr.w VI0, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| xvinsgr2vr.w VI0, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 | |||
| #endif | |||
| .align 3 | |||
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvld VX1, X, 2 * SIZE | |||
| addi.d I, I, -1 | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| @@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfcmp.clt.d VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| xvfadd.d x1, x1, x2 | |||
| xvfmax.d x3, VM0, x1 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| xvld VX0, X, 4 * SIZE | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvld VX1, X, 6 * SIZE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| #else | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvld VX1, X, 8 * SIZE | |||
| xvadd.w VI1, VI1, VINC4 | |||
| xvld VX1, X, 4 * SIZE | |||
| addi.d I, I, -1 | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VI4, x1 | |||
| xvfmul.s x4, VI4, x2 | |||
| xvfcmp.clt.s VT0, x1, VI3 | |||
| xvfcmp.clt.s VINC4, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC4 | |||
| #endif | |||
| XVFADD x1, x1, x2 | |||
| XVFMAX x3, VM0, x1 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| XVCMPLT VT0, x1, VI3 | |||
| XVCMPLT VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| XVFADD x1, x1, x2 | |||
| XVFMAX x3, VM0, x1 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| @@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmax.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, VM1, x1 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| #else | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, VM1, x1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmax.d VM0, x3, x4 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmax.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VM1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| b .L26 | |||
| #endif | |||
| .align 3 | |||
| .L20: // INCX!=1 | |||
| @@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| addi.d i0, i0, -7 | |||
| addi.d i0, i0, -3 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, -1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 1 //3 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, -1 | |||
| xvinsgr2vr.d VI0, i0, 2 //2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 0 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| addi.w i0, i0, -7 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 1 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, -3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 0 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| xvinsgr2vr.w VI0, i0, 1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 2 //5 | |||
| xvinsgr2vr.w VI0, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //6 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 4 //3 | |||
| xvinsgr2vr.w VI0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //4 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| xvinsgr2vr.w VI0, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| xvinsgr2vr.w VI0, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 | |||
| #endif | |||
| .align 3 | |||
| @@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| xvfcmp.clt.d VT0, x1, VI3 | |||
| xvfcmp.clt.d VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| xvfadd.d x1, x1, x2 | |||
| xvfmax.d x3, VM0, x1 | |||
| ld.d t1, X, 0 * SIZE | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| ld.d t2, X, 1 * SIZE | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d x1, t1, 2 | |||
| xvinsgr2vr.d x2, t2, 2 | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| xvinsgr2vr.d x1, t1, 0 | |||
| xvinsgr2vr.d x2, t2, 0 | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| xvadd.d VI1, VI1, VINC4 | |||
| addi.d I, I, -1 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| @@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 0 | |||
| xvinsgr2vr.w x1, t3, 1 | |||
| xvinsgr2vr.w x2, t4, 1 | |||
| xvadd.w VI1, VI1, VINC4 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| @@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 2 | |||
| xvinsgr2vr.w x1, t3, 3 | |||
| xvinsgr2vr.w x2, t4, 3 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 4 | |||
| xvinsgr2vr.w x2, t2, 4 | |||
| xvinsgr2vr.w x1, t3, 5 | |||
| xvinsgr2vr.w x2, t4, 5 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 6 | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| addi.d I, I, -1 | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VI4, x1 | |||
| xvfmul.s x4, VI4, x2 | |||
| xvfcmp.clt.s VT0, x1, VI3 | |||
| @@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmaxa.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, VM1, x1 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| #else | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, VM1, x1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.d VM0, x3, x4 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VM1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| #endif | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L27 | |||
| XVCMPLT VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| xvfmaxa.d VM0, x1, x2 | |||
| xvfcmp.ceq.d VT0, x1, VM0 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| .align 3 | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L28 | |||
| XVCMPLT VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L29 | |||
| XVCMPLT VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .align 3 | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| movfr2gr.d i0, $f20 | |||
| #else | |||
| fmov.s $f16, $f20 | |||
| #endif | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| #else | |||
| .L252: | |||
| xvxor.v VI0, VI0, VI0 | |||
| xvor.v VI0, VI0, VX0 | |||
| fmov.s $f13, $f15 | |||
| xvxor.v VM0, VM0, VM0 | |||
| xvor.v VM0, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 4 | |||
| xvpickve.w VI2, VI0, 5 | |||
| xvpickve.w VI3, VI0, 6 | |||
| xvpickve.w VI4, VI0, 7 | |||
| xvpickve.w x1, VM0, 4 | |||
| xvpickve.w x2, VM0, 5 | |||
| xvpickve.w x3, VM0, 6 | |||
| xvpickve.w x4, VM0, 7 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v x1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L262 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| .align 3 | |||
| .L262: | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L272 | |||
| bceqz $fcc0, .L27 | |||
| xvfcmp.clt.s VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| .align 3 | |||
| .L272: | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L282 | |||
| bceqz $fcc0, .L28 | |||
| xvfcmp.clt.s VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L282: | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L292 | |||
| bceqz $fcc0, .L29 | |||
| xvfcmp.clt.s VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .align 3 | |||
| .L292: | |||
| fcmp.clt.s $fcc0, $f15, $f13 | |||
| fsel $f15, $f15, $f13, $fcc0 | |||
| fsel $f20, $f20, $f16, $fcc0 | |||
| .L29: | |||
| movfr2gr.s i0, $f20 | |||
| .align 3 | |||
| #endif | |||
| .L21: //N<8 | |||
| #ifdef DOUBLE | |||
| .L21: //N<4 | |||
| andi I, N, 3 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 2 | |||
| slli.d i1, i1, 2 | |||
| #else | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 3 | |||
| slli.d i1, i1, 3 | |||
| #endif | |||
| addi.d i1, i1, 1 //current index | |||
| movgr2fr.d $f21, i1 | |||
| movgr2fr.d $f20, i0 | |||
| @@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i1, i1, 1 | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L22 | |||
| MTG i0, $f20 | |||
| MTG i0, $f20 | |||
| .align 3 | |||
| .L999: | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| @@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define t2 $r13 | |||
| #define t3 $r14 | |||
| #define t4 $r15 | |||
| /* Don't change following FR unless you know the effects. */ | |||
| #define VX0 $xr15 | |||
| #define VX1 $xr16 | |||
| #define VX2 $xr17 | |||
| #define VX3 $xr18 | |||
| #define VX4 $xr21 | |||
| #define VX5 $xr22 | |||
| /* Don't change following FR unless you know the effects. */ | |||
| #define res1 $xr19 | |||
| #define res2 $xr20 | |||
| #define RCP $f2 | |||
| #define VALPHA $xr3 | |||
| // The optimization for snrm2 cannot simply involve | |||
| // extending the data type from float to double and | |||
| // then summing the squares of the data. LAPACK tests | |||
| // have shown that this approach can still lead to data overflow. | |||
| // Instead, we need to find the maximum absolute value in the entire | |||
| // array and divide each data element by this maximum value before | |||
| // performing the calculation. This approach can avoid overflow (and does not require extending the data type). | |||
| PROLOGUE | |||
| @@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT N, 0(N) | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| addi.d $sp, $sp, -32 | |||
| st.d $ra, $sp, 0 | |||
| st.d N, $sp, 8 | |||
| st.d X, $sp, 16 | |||
| st.d INCX, $sp, 24 | |||
| #ifdef DYNAMIC_ARCH | |||
| bl samax_k_LA264 | |||
| #else | |||
| bl samax_k | |||
| #endif | |||
| ld.d $ra, $sp, 0 | |||
| ld.d N, $sp, 8 | |||
| ld.d X, $sp, 16 | |||
| ld.d INCX, $sp, 24 | |||
| addi.d $sp, $sp, 32 | |||
| frecip.s RCP, $f0 | |||
| vreplvei.w $vr3, $vr2, 0 | |||
| xvpermi.d VALPHA, $xr3,0x00 | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| fcmp.ceq.s $fcc0, $f0, $f19 | |||
| bcnez $fcc0, .L999 | |||
| li.d TEMP, SIZE | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 4 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L997 | |||
| bge $r0, I, .L997 | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| xvld VX0, X, 0 | |||
| xvld VX5, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmul.s VX5, VX5, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| xvfmadd.s res2, VX5, VX5, res2 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| b .L996 | |||
| .align 3 | |||
| .L20: | |||
| bge $r0, I, .L997 | |||
| @@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 0 | |||
| xvinsgr2vr.w VX0, t2, 1 | |||
| xvinsgr2vr.w VX0, t3, 2 | |||
| xvinsgr2vr.w VX0, t4, 3 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res2, VX0, VX0, res2 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| b .L996 | |||
| .align 3 | |||
| .L996: | |||
| xvfadd.d res1, res1, res2 | |||
| xvpickve.d VX1, res1, 1 | |||
| xvpickve.d VX2, res1, 2 | |||
| xvpickve.d VX3, res1, 3 | |||
| fadd.d $f19, $f19, $f16 | |||
| fadd.d $f19, $f19, $f17 | |||
| fadd.d $f19, $f19, $f18 | |||
| xvfadd.s res1, res1, res2 | |||
| xvpermi.d VX1, res1, 0x4e | |||
| xvfadd.s res1, res1, VX1 | |||
| vreplvei.w $vr16, $vr19, 1 | |||
| vreplvei.w $vr17, $vr19, 2 | |||
| vreplvei.w $vr18, $vr19, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| .align 3 | |||
| .L997: | |||
| andi I, N, 7 | |||
| andi I, N, 15 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L998: | |||
| fld.s $f15, X, 0 | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| fcvt.d.s $f15, $f15 | |||
| fmadd.d $f19, $f15, $f15, $f19 | |||
| addi.d I, I, -1 | |||
| fmul.s $f15, $f15, RCP | |||
| fmadd.s $f19, $f15, $f15, $f19 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| .L999: | |||
| fsqrt.d $f19, $f19 | |||
| fsqrt.s $f19, $f19 | |||
| fmul.s $f0, $f19, $f0 | |||
| move $r4, $r17 | |||
| fcvt.s.d $f0, $f19 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| EPILOGUE | |||
| @@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| move XX, X | |||
| .L222: | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| LD a2, X, 0 | |||
| add.d X, X, INCX | |||
| LD a3, X, 0 | |||
| add.d X, X, INCX | |||
| LD a4, X, 0 | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 | |||
| ST a1, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b2, Y, 0 | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b3, Y, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b4, Y, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST b1, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b1, Y, 0 | |||
| ST a1, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a2, X, 0 | |||
| add.d X, X, INCX | |||
| ST b2, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b2, Y, 0 | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a3, X, 0 | |||
| add.d X, X, INCX | |||
| ST b3, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b3, Y, 0 | |||
| ST a3, Y, 0 | |||
| LD a4, X, 0 | |||
| add.d X, X, INCX | |||
| ST b4, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b4, Y, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST b1, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b2, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b3, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b4, XX, 0 | |||
| add.d XX, XX, INCX | |||
| addi.d I, I, -1 | |||
| .rept 8 | |||
| LD $f12, X, 0 | |||
| LD $f14, Y, 0 | |||
| ST $f12, Y, 0 | |||
| ST $f14, X, 0 | |||
| add.d X, X, INCX | |||
| add.d Y, Y, INCY | |||
| .endr | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L222 | |||
| .align 3 | |||
| @@ -17,454 +17,369 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, | |||
| BLASLONG lda4, FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| FLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4; | |||
| b1 = a1 + lda4; | |||
| b2 = a2 + lda4; | |||
| b3 = a3 + lda4; | |||
| x0 = xo[0] * (*alpha); | |||
| x1 = xo[1] * (*alpha); | |||
| x2 = xo[2] * (*alpha); | |||
| x3 = xo[3] * (*alpha); | |||
| x4 = xo[4] * (*alpha); | |||
| x5 = xo[5] * (*alpha); | |||
| x6 = xo[6] * (*alpha); | |||
| x7 = xo[7] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| __vector float v_x2 = {x2, x2, x2, x2}; | |||
| __vector float v_x3 = {x3, x3, x3, x3}; | |||
| __vector float v_x4 = {x4, x4, x4, x4}; | |||
| __vector float v_x5 = {x5, x5, x5, x5}; | |||
| __vector float v_x6 = {x6, x6, x6, x6}; | |||
| __vector float v_x7 = {x7, x7, x7, x7}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float va2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float va3 = vec_vsx_ld(0, &a3[i]); | |||
| __vector float vb0 = vec_vsx_ld(0, &b0[i]); | |||
| __vector float vb1 = vec_vsx_ld(0, &b1[i]); | |||
| __vector float vb2 = vec_vsx_ld(0, &b2[i]); | |||
| __vector float vb3 = vec_vsx_ld(0, &b3[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; | |||
| vy += v_x4 * vb0 + v_x5 * vb1 + v_x6 * vb2 + v_x7 * vb3; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| x0 = xo[0] * (*alpha); | |||
| x1 = xo[1] * (*alpha); | |||
| x2 = xo[2] * (*alpha); | |||
| x3 = xo[3] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| __vector float v_x2 = {x2, x2, x2, x2}; | |||
| __vector float v_x3 = {x3, x3, x3, x3}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float va2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float va3 = vec_vsx_ld(0, &a3[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| FLOAT x0, x1; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| x0 = x[0] * (*alpha); | |||
| x1 = x[1] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| FLOAT x0 = x[0] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &ap[i]); | |||
| vy += v_x0 * va0; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| for (i = 0; i < n; i++) { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) { | |||
| BLASLONG i, n1, m1, m2, m3, n2, lda4, lda8; | |||
| FLOAT *a_ptr, *x_ptr, *y_ptr, *ap[4]; | |||
| lda4 = lda << 2; | |||
| lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer = buffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| if (inc_x == 1) { | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| } else { | |||
| n1 = n >> 2; | |||
| n2 = n & 3; | |||
| } | |||
| m3 = m & 3; | |||
| m1 = m & -4; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if (inc_y != 1) | |||
| memset(ybuffer, 0, NB * 4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if (inc_x == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, ap, x_ptr, ybuffer, lda4, &alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if (n2 & 4) { | |||
| sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); | |||
| a_ptr += lda * 2; | |||
| x_ptr += 2; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for (i = 0; i < n2; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if (inc_y != 1) { | |||
| add_y(NB, ybuffer, y_ptr, inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } else | |||
| y_ptr += NB; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| if (m3 == 3) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if (lda == 3 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr++; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if (lda == 2 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr++; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return (0); | |||
| } | |||
| if (m3 == 1) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if (lda == 1 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + | |||
| a_ptr[i + 2] * x_ptr[i + 2] + | |||
| a_ptr[i + 3] * x_ptr[i + 3]; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return (0); | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -17,12 +17,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| register __vector float temp0 = {0, 0, 0, 0}; | |||
| register __vector float temp1 = {0, 0, 0, 0}; | |||
| register __vector float temp2 = {0, 0, 0, 0}; | |||
| register __vector float temp3 = {0, 0, 0, 0}; | |||
| register __vector float temp4 = {0, 0, 0, 0}; | |||
| register __vector float temp5 = {0, 0, 0, 0}; | |||
| register __vector float temp6 = {0, 0, 0, 0}; | |||
| register __vector float temp7 = {0, 0, 0, 0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| @@ -56,43 +56,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| #if defined(POWER8) | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| #else | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = { alpha, alpha, alpha, alpha }; | |||
| __vector float *v_y = (__vector float*) y; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float vva2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float vva3 = vec_vsx_ld(0, &a3[i]); | |||
| __vector float vva4 = vec_vsx_ld(0, &a4[i]); | |||
| __vector float vva5 = vec_vsx_ld(0, &a5[i]); | |||
| __vector float vva6 = vec_vsx_ld(0, &a6[i]); | |||
| __vector float vva7 = vec_vsx_ld(0, &a7[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| temp2 += vx * vva2; | |||
| temp3 += vx * vva3; | |||
| temp4 += vx * vva4; | |||
| temp5 += vx * vva5; | |||
| temp6 += vx * vva6; | |||
| temp7 += vx * vva7; | |||
| } | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = {alpha, alpha, alpha, alpha}; | |||
| __vector float vy0 = vec_vsx_ld(0, y); | |||
| __vector float vy1 = vec_vsx_ld(0, &(y[4])); | |||
| t0 = vec_mergeh(temp0, temp2); | |||
| t1 = vec_mergel(temp0, temp2); | |||
| t2 = vec_mergeh(temp1, temp3); | |||
| @@ -113,44 +102,41 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| temp7 = vec_mergel(t1, t3); | |||
| temp4 += temp5 + temp6 + temp7; | |||
| v_y[0] += a * temp0; | |||
| v_y[1] += a * temp4; | |||
| #endif | |||
| vy0 += a * temp0; | |||
| vy1 += a * temp4; | |||
| vec_vsx_st(vy0, 0, y); | |||
| vec_vsx_st(vy1, 0, &(y[4])); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| register __vector float temp0 = {0, 0, 0, 0}; | |||
| register __vector float temp1 = {0, 0, 0, 0}; | |||
| register __vector float temp2 = {0, 0, 0, 0}; | |||
| register __vector float temp3 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float vva2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float vva3 = vec_vsx_ld(0, &a3[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| temp2 += vx * vva2; | |||
| temp3 += vx * vva3; | |||
| } | |||
| #if defined(POWER8) | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| #else | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = { alpha, alpha, alpha, alpha }; | |||
| __vector float *v_y = (__vector float*) y; | |||
| register __vector float a = {alpha, alpha, alpha, alpha}; | |||
| __vector float vy0 = vec_vsx_ld(0, y); | |||
| t0 = vec_mergeh(temp0, temp2); | |||
| t1 = vec_mergel(temp0, temp2); | |||
| @@ -162,47 +148,42 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| temp3 = vec_mergel(t1, t3); | |||
| temp0 += temp1 + temp2 + temp3; | |||
| v_y[0] += a * temp0; | |||
| #endif | |||
| } | |||
| vy0 += a * temp0; | |||
| vec_vsx_st(vy0, 0, y); | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| __vector float temp0 = {0, 0, 0, 0}; | |||
| __vector float temp1 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, | |||
| FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| __vector float temp0 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &ap[i]); | |||
| temp0 += vx * vva0; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| @@ -213,20 +194,14 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) { | |||
| BLASLONG i, j, n1, m1, m2, m3, n2; | |||
| FLOAT *a_ptr, *x_ptr, *y_ptr; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| @@ -242,7 +217,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| @@ -260,20 +234,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| @@ -285,8 +254,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| @@ -307,10 +274,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| @@ -318,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| a_ptr += lda << 2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| @@ -334,20 +299,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| @@ -365,13 +326,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| y_ptr[j + 1] += | |||
| aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += | |||
| aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += | |||
| aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| @@ -381,38 +343,40 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| y_ptr[j] += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + | |||
| *(aj + lda + 1) * xtemp1 + | |||
| *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + | |||
| *(aj + lda2 + 1) * xtemp1 + | |||
| *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + | |||
| *(aj + lda3 + 1) * xtemp1 + | |||
| *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j] += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| *y_ptr += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -426,14 +390,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| @@ -443,22 +405,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| y_ptr[j + 1] += | |||
| *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += | |||
| *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += | |||
| *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| @@ -470,10 +432,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| @@ -490,10 +450,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| @@ -516,12 +474,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -27,13 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m8) | |||
| #define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f32m8) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| @@ -42,103 +44,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4) | |||
| #define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| if(n < 0) return(0); | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp = 0.0; | |||
| FLOAT_V_T va0, va1, vy0, vy1; | |||
| unsigned int gvl = 0; | |||
| if(inc_y == 1){ | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| vy1 = VLEV_FLOAT(&y[j+gvl], gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl); | |||
| va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| VSEV_FLOAT(&y[j+gvl], vy1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| if(n < 0) return(0); | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp[4]; | |||
| FLOAT_V_T va0, va1, vy0, vy1,vy0_temp, vy1_temp , temp_v ,va0_0 , va0_1 , va1_0 ,va1_1 ,va2_0 ,va2_1 ,va3_0 ,va3_1 ; | |||
| unsigned int gvl = 0; | |||
| if(inc_y == 1 && inc_x == 1){ | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0_temp = VLEV_FLOAT(&y[j], gvl); | |||
| vy1_temp = VLEV_FLOAT(&y[j+gvl], gvl); | |||
| vy0 = VFILL_ZERO_FLOAT(vy0 , vy0 , gvl); | |||
| vy1 = VFILL_ZERO_FLOAT(vy1 , vy1 , gvl); | |||
| int i; | |||
| int remainder = n % 4; | |||
| for(i = 0; i < remainder; i++){ | |||
| temp[0] = x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl); | |||
| a_ptr += lda; | |||
| ix ++; | |||
| } | |||
| }else{ | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl); | |||
| va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp, va1, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl); | |||
| j += gvl * 2; | |||
| iy += inc_yv * 2; | |||
| } | |||
| for(i = remainder; i < n; i += 4){ | |||
| va0_0 = VLEV_FLOAT(&(a_ptr)[j], gvl); | |||
| va0_1 = VLEV_FLOAT(&(a_ptr)[j+gvl], gvl); | |||
| va1_0 = VLEV_FLOAT(&(a_ptr+lda * 1)[j], gvl); | |||
| va1_1 = VLEV_FLOAT(&(a_ptr+lda * 1)[j+gvl], gvl); | |||
| va2_0 = VLEV_FLOAT(&(a_ptr+lda * 2)[j], gvl); | |||
| va2_1 = VLEV_FLOAT(&(a_ptr+lda * 2)[j+gvl], gvl); | |||
| va3_0 = VLEV_FLOAT(&(a_ptr+lda * 3)[j], gvl); | |||
| va3_1 = VLEV_FLOAT(&(a_ptr+lda * 3)[j+gvl], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix], va0_0, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, x[ix], va0_1, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+1], va1_0, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, x[ix+1], va1_1, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+2], va2_0, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, x[ix+2], va2_1, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+3], va3_0, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, x[ix+3], va3_1, gvl); | |||
| a_ptr += 4 * lda; | |||
| ix +=4; | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp, va0, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| j += gvl; | |||
| vy0 = VFMACCVF_FLOAT(vy0_temp, alpha, vy0, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1_temp, alpha, vy1, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| VSEV_FLOAT(&y[j+gvl], vy1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| } | |||
| //tail | |||
| if(gvl <= m - j ){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0_temp = VLEV_FLOAT(&y[j], gvl); | |||
| vy0 = VFILL_ZERO_FLOAT(vy0 , vy0 , gvl); | |||
| int i; | |||
| int remainder = n % 4; | |||
| for(i = 0; i < remainder; i++){ | |||
| temp[0] = x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| a_ptr += lda; | |||
| ix ++; | |||
| } | |||
| for(i = remainder; i < n; i += 4){ | |||
| va0_0 = VLEV_FLOAT(&(a_ptr)[j], gvl); | |||
| va1_0 = VLEV_FLOAT(&(a_ptr+lda * 1)[j], gvl); | |||
| va2_0 = VLEV_FLOAT(&(a_ptr+lda * 2)[j], gvl); | |||
| va3_0 = VLEV_FLOAT(&(a_ptr+lda * 3)[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix], va0_0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+1], va1_0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+2], va2_0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, x[ix+3], va3_0, gvl); | |||
| a_ptr += 4 * lda; | |||
| ix +=4; | |||
| } | |||
| vy0 = VFMACCVF_FLOAT(vy0_temp, alpha, vy0, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl ; | |||
| } | |||
| for(;j < m;){ | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp[0] = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else if (inc_y == 1 && inc_x !=1) { | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| vy1 = VLEV_FLOAT(&y[j+gvl], gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp[0] = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| VSEV_FLOAT(&y[j+gvl], vy1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp[0] = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+inc_yv], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp[0] = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| va1 = VLEV_FLOAT(&a_ptr[j+gvl], gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp[0], va1, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+inc_yv], stride_y, vy1, gvl); | |||
| j += gvl * 2; | |||
| iy += inc_yv * 2; | |||
| } | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| temp[0] = alpha * x[ix]; | |||
| va0 = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp[0], va0, gvl); | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,32 +27,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp_r = 0.0, temp_i = 0.0; | |||
| FLOAT_V_T va0, va1, vy0, vy1; | |||
| FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4]; | |||
| FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4; | |||
| unsigned int gvl = 0; | |||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; | |||
| @@ -60,104 +64,248 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| BLASLONG inc_x2 = inc_x * 2; | |||
| BLASLONG lda2 = lda * 2; | |||
| for(k=0,j=0; k<m/gvl; k++){ | |||
| vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); | |||
| for (k = 0, j = 0; k < m / gvl; k++) | |||
| { | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| vy0 = vy0_new; | |||
| vy1 = vy1_new; | |||
| if (k < m / gvl - 1) | |||
| { | |||
| vy0_new = VLSEV_FLOAT(&y[iy + inc_yv], stride_y, gvl); | |||
| vy1_new = VLSEV_FLOAT(&y[iy + inc_yv + 1], stride_y, gvl); | |||
| } | |||
| for (i = 0; i < n % 4; i++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #endif | |||
| a_ptr += lda2; | |||
| ix += inc_x2; | |||
| } | |||
| for (; i < n; i += 4) | |||
| { | |||
| #if !defined(XCONJ) | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| #else | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| va2 = VLSEV_FLOAT(&a_ptr[j + lda2], stride_a, gvl); | |||
| va3 = VLSEV_FLOAT(&a_ptr[j + lda2 + 1], stride_a, gvl); | |||
| va4 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2], stride_a, gvl); | |||
| va5 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2 + 1], stride_a, gvl); | |||
| va6 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3], stride_a, gvl); | |||
| va7 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3 + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #endif | |||
| #endif | |||
| a_ptr += lda2 * 4; | |||
| ix += inc_x2 * 4; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); | |||
| VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); | |||
| j += gvl * 2; | |||
| iy += inc_yv; | |||
| } | |||
| //tail | |||
| if(j/2 < m){ | |||
| gvl = VSETVL(m-j/2); | |||
| // tail | |||
| if (j / 2 < m) | |||
| { | |||
| gvl = VSETVL(m - j / 2); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #endif | |||
| @@ -165,9 +313,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| ix += inc_x2; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); | |||
| VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); | |||
| } | |||
| return(0); | |||
| return (0); | |||
| } | |||
| @@ -180,9 +180,7 @@ gotoblas_t TABLE_NAME = { | |||
| sgemm_direct_performantTS, | |||
| #endif | |||
| #ifdef ARCH_ARM64 | |||
| #ifdef HAVE_SME | |||
| sgemm_directTS, | |||
| #endif | |||
| #endif | |||
| sgemm_kernelTS, sgemm_betaTS, | |||
| @@ -231,7 +231,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf | |||
| accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); | |||
| accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); | |||
| if ((m-tag_m_32x) > 16) { | |||
| if ((m-tag_m_32x) >= 16) { | |||
| STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) | |||
| STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) | |||
| } else { | |||
| @@ -54,7 +54,7 @@ static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if (inc_x == 1) { | |||
| sumf = zsum_kernel(n, x); | |||
| sumf = zasum_kernel(n, x); | |||
| } | |||
| else { | |||
| inc_x2 = 2 * inc_x; | |||
| @@ -714,6 +714,8 @@ ld %f10,136(%r15) | |||
| ld %f11,144(%r15) | |||
| ld %f12,152(%r15) | |||
| br %r14 | |||
| EPILOGUE | |||
| .end | |||
| @@ -604,6 +604,8 @@ ALIGN_2 | |||
| /*end*/ | |||
| lmg %r6,%r12,48(%r15) | |||
| br %r14 | |||
| EPILOGUE | |||
| .end | |||
| @@ -845,6 +845,8 @@ ALIGN_2 | |||
| lmg %r6,%r12,48(%r15) | |||
| #endif | |||
| br %r14 | |||
| EPILOGUE | |||
| .end | |||
| @@ -864,6 +864,8 @@ ALIGN_2 | |||
| lmg %r6,%r12,48(%r15) | |||
| #endif | |||
| br %r14 | |||
| EPILOGUE | |||
| .end | |||
| @@ -719,6 +719,8 @@ ld %f10,136(%r15) | |||
| ld %f11,144(%r15) | |||
| ld %f12,152(%r15) | |||
| br %r14 | |||
| EPILOGUE | |||
| .end | |||
| @@ -75,7 +75,16 @@ extern "C" { | |||
| #ifndef LAPACK_COMPLEX_CUSTOM | |||
| #if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER) | |||
| #if defined(LAPACK_COMPLEX_CPP) | |||
| #include <complex> | |||
| #define lapack_complex_float std::complex<float> | |||
| #define lapack_complex_double std::complex<double> | |||
| #define lapack_complex_float_real(z) ((z).real()) | |||
| #define lapack_complex_float_imag(z) ((z).imag()) | |||
| #define lapack_complex_double_real(z) ((z).real()) | |||
| #define lapack_complex_double_imag(z) ((z).imag()) | |||
| #define _CRT_USE_C_COMPLEX_H | |||
| #else | |||
| #include <complex.h> | |||
| #define LAPACK_COMPLEX_CUSTOM | |||
| #define lapack_complex_float _Fcomplex | |||
| @@ -84,6 +93,7 @@ extern "C" { | |||
| #define lapack_complex_float_imag(z) (cimag(z)) | |||
| #define lapack_complex_double_real(z) (creal(z)) | |||
| #define lapack_complex_double_imag(z) (cimag(z)) | |||
| #endif | |||
| #else | |||
| #if defined(LAPACK_COMPLEX_STRUCTURE) | |||
| @@ -710,8 +710,8 @@ or GE matrices</b> */ | |||
| /* > \ingroup complexGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void cgees_(char *jobvs, char *sort, L_fp select, integer *n, | |||
| complex *a, integer *lda, integer *sdim, complex *w, complex *vs, | |||
| /* Subroutine */ void cgees_(char *jobvs, char *sort, logical (*select)(complex*), | |||
| integer *n, complex *a, integer *lda, integer *sdim, complex *w, complex *vs, | |||
| integer *ldvs, complex *work, integer *lwork, real *rwork, logical * | |||
| bwork, integer *info) | |||
| { | |||
| @@ -752,8 +752,8 @@ f"> */ | |||
| /* > \ingroup complexGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void cgeesx_(char *jobvs, char *sort, L_fp select, char * | |||
| sense, integer *n, complex *a, integer *lda, integer *sdim, complex * | |||
| /* Subroutine */ void cgeesx_(char *jobvs, char *sort, logical (*select)(complex*), | |||
| char *sense, integer *n, complex *a, integer *lda, integer *sdim, complex * | |||
| w, complex *vs, integer *ldvs, real *rconde, real *rcondv, complex * | |||
| work, integer *lwork, real *rwork, logical *bwork, integer *info) | |||
| { | |||
| @@ -485,12 +485,12 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0 ) THEN | |||
| CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) | |||
| @@ -784,8 +784,8 @@ or GE matrices</b> */ | |||
| /* > \ingroup complexGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, complex *a, integer *lda, complex *b, integer * | |||
| /* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer * | |||
| ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl, | |||
| integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * | |||
| lwork, real *rwork, logical *bwork, integer *info) | |||
| @@ -783,8 +783,8 @@ f"> */ | |||
| /* > \ingroup complexGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, complex *a, integer *lda, complex *b, integer * | |||
| /* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer * | |||
| ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl, | |||
| integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * | |||
| lwork, real *rwork, logical *bwork, integer *info) | |||
| @@ -843,8 +843,8 @@ f"> */ | |||
| /* > \ingroup complexGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, char *sense, integer *n, complex *a, integer *lda, complex *b, | |||
| /* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(complex*,complex*), char *sense, integer *n, complex *a, integer *lda, complex *b, | |||
| integer *ldb, integer *sdim, complex *alpha, complex *beta, complex * | |||
| vsl, integer *ldvsl, complex *vsr, integer *ldvsr, real *rconde, real | |||
| *rcondv, complex *work, integer *lwork, real *rwork, integer *iwork, | |||
| @@ -729,7 +729,7 @@ or GE matrices</b> */ | |||
| /* > \ingroup doubleGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void dgees_(char *jobvs, char *sort, L_fp select, integer *n, | |||
| /* Subroutine */ void dgees_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), integer *n, | |||
| doublereal *a, integer *lda, integer *sdim, doublereal *wr, | |||
| doublereal *wi, doublereal *vs, integer *ldvs, doublereal *work, | |||
| integer *lwork, logical *bwork, integer *info) | |||
| @@ -793,7 +793,7 @@ f"> */ | |||
| /* > \ingroup doubleGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void dgeesx_(char *jobvs, char *sort, L_fp select, char * | |||
| /* Subroutine */ void dgeesx_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), char * | |||
| sense, integer *n, doublereal *a, integer *lda, integer *sdim, | |||
| doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs, | |||
| doublereal *rconde, doublereal *rcondv, doublereal *work, integer * | |||
| @@ -506,17 +506,17 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, | |||
| $ IERR ) | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, | |||
| $ IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = MAXWRK | |||
| @@ -798,8 +798,8 @@ or GE matrices</b> */ | |||
| /* > \ingroup doubleGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, doublereal *a, integer *lda, doublereal *b, | |||
| /* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (selctg)(doublereal*, doublereal*, doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b, | |||
| integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, | |||
| doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, | |||
| integer *ldvsr, doublereal *work, integer *lwork, logical *bwork, | |||
| @@ -796,8 +796,8 @@ f"> */ | |||
| /* > \ingroup doubleGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, doublereal *a, integer *lda, doublereal *b, | |||
| /* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(doublereal*,doublereal*,doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b, | |||
| integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, | |||
| doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, | |||
| integer *ldvsr, doublereal *work, integer *lwork, logical *bwork, | |||
| @@ -878,8 +878,8 @@ f"> */ | |||
| /* > \endverbatim */ | |||
| /* > */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, char *sense, integer *n, doublereal *a, integer *lda, | |||
| /* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(doublereal*,doublereal*,doublereal*), char *sense, integer *n, doublereal *a, integer *lda, | |||
| doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, | |||
| doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, | |||
| doublereal *vsr, integer *ldvsr, doublereal *rconde, doublereal * | |||
| @@ -482,7 +482,7 @@ or GE matrices</b> */ | |||
| /* > \ingroup realGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void sgees_(char *jobvs, char *sort, L_fp select, integer *n, | |||
| /* Subroutine */ void sgees_(char *jobvs, char *sort, logical(*select)(real*,real*), integer *n, | |||
| real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs, | |||
| integer *ldvs, real *work, integer *lwork, logical *bwork, integer * | |||
| info) | |||
| @@ -550,7 +550,7 @@ f"> */ | |||
| /* > \ingroup realGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void sgeesx_(char *jobvs, char *sort, L_fp select, char * | |||
| /* Subroutine */ void sgeesx_(char *jobvs, char *sort, logical(*select)(real*,real*), char * | |||
| sense, integer *n, real *a, integer *lda, integer *sdim, real *wr, | |||
| real *wi, real *vs, integer *ldvs, real *rconde, real *rcondv, real * | |||
| work, integer *lwork, integer *iwork, integer *liwork, logical *bwork, | |||
| @@ -504,17 +504,17 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, | |||
| $ IERR ) | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, | |||
| $ IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) | |||
| @@ -555,8 +555,8 @@ or GE matrices</b> */ | |||
| /* > \ingroup realGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, | |||
| /* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb, | |||
| integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, | |||
| integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, | |||
| logical *bwork, integer *info) | |||
| @@ -553,8 +553,8 @@ f"> */ | |||
| /* > \ingroup realGEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, | |||
| /* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb, | |||
| integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, | |||
| integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, | |||
| logical *bwork, integer *info) | |||
| @@ -635,8 +635,8 @@ f"> */ | |||
| /* > \endverbatim */ | |||
| /* > */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, char *sense, integer *n, real *a, integer *lda, real *b, | |||
| /* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(real*,real*,real*), char *sense, integer *n, real *a, integer *lda, real *b, | |||
| integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, | |||
| real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *rconde, | |||
| real *rcondv, real *work, integer *lwork, integer *iwork, integer * | |||
| @@ -710,8 +710,8 @@ or GE matrices</b> */ | |||
| /* > \ingroup complex16GEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void zgees_(char *jobvs, char *sort, L_fp select, integer *n, | |||
| doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, | |||
| /* Subroutine */ void zgees_(char *jobvs, char *sort, logical (*select)(doublecomplex*), | |||
| integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, | |||
| doublecomplex *vs, integer *ldvs, doublecomplex *work, integer *lwork, | |||
| doublereal *rwork, logical *bwork, integer *info) | |||
| { | |||
| @@ -751,8 +751,8 @@ f"> */ | |||
| /* > \ingroup complex16GEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void zgeesx_(char *jobvs, char *sort, L_fp select, char * | |||
| sense, integer *n, doublecomplex *a, integer *lda, integer *sdim, | |||
| /* Subroutine */ void zgeesx_(char *jobvs, char *sort, logical (*select)(doublecomplex*), | |||
| char * sense, integer *n, doublecomplex *a, integer *lda, integer *sdim, | |||
| doublecomplex *w, doublecomplex *vs, integer *ldvs, doublereal * | |||
| rconde, doublereal *rcondv, doublecomplex *work, integer *lwork, | |||
| doublereal *rwork, logical *bwork, integer *info) | |||
| @@ -485,12 +485,12 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = MAXWRK | |||
| @@ -784,8 +784,9 @@ or GE matrices</b> */ | |||
| /* > \ingroup complex16GEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, | |||
| /* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a, | |||
| integer *lda, doublecomplex *b, | |||
| integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * | |||
| beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer | |||
| *ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork, | |||
| @@ -783,8 +783,9 @@ f"> */ | |||
| /* > \ingroup complex16GEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, | |||
| /* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a, | |||
| integer *lda, doublecomplex *b, | |||
| integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * | |||
| beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer | |||
| *ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork, | |||
| @@ -843,8 +843,9 @@ f"> */ | |||
| /* > \ingroup complex16GEeigen */ | |||
| /* ===================================================================== */ | |||
| /* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp | |||
| selctg, char *sense, integer *n, doublecomplex *a, integer *lda, | |||
| /* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, logical | |||
| (*selctg)(doublecomplex*,doublecomplex*), char *sense, integer *n, | |||
| doublecomplex *a, integer *lda, | |||
| doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, | |||
| doublecomplex *beta, doublecomplex *vsl, integer *ldvsl, | |||
| doublecomplex *vsr, integer *ldvsr, doublereal *rconde, doublereal * | |||
| @@ -107,12 +107,6 @@ set(ZDMDEIGTST zchkdmd.f90) | |||
| macro(add_eig_executable name) | |||
| add_executable(${name} ${ARGN}) | |||
| target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(${name} omp pthread) | |||
| endif() | |||
| #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) | |||
| endmacro() | |||
| @@ -332,7 +332,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test CGESDD | |||
| @@ -367,7 +367,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test CGEJSV | |||
| @@ -433,7 +433,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test CGESVDX | |||
| @@ -492,7 +492,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test CGESVDQ | |||
| @@ -547,7 +547,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -558,7 +558,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -329,7 +329,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test DGESDD | |||
| @@ -358,7 +358,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test DGEJSV | |||
| @@ -424,7 +424,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test DGESVDX | |||
| @@ -483,7 +483,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test DGESVDQ | |||
| @@ -538,7 +538,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -549,7 +549,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -329,7 +329,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test SGESDD | |||
| @@ -358,7 +358,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test SGEJSV | |||
| @@ -424,7 +424,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test SGESVDX | |||
| @@ -483,7 +483,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test SGESVDQ | |||
| @@ -538,7 +538,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -549,7 +549,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -332,7 +332,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test ZGESDD | |||
| @@ -367,7 +367,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test ZGEJSV | |||
| @@ -433,7 +433,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test ZGESVDX | |||
| @@ -492,7 +492,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| * | |||
| * Test ZGESVDQ | |||
| @@ -547,7 +547,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -558,7 +558,7 @@ | |||
| WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), | |||
| $ NT | |||
| ELSE | |||
| WRITE( NOUT, FMT = 9998 ) | |||
| WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) | |||
| END IF | |||
| END IF | |||
| * | |||
| @@ -240,10 +240,6 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr | |||
| macro(add_lin_executable name) | |||
| add_executable(${name} ${ARGN}) | |||
| target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
| string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
| target_link_libraries(${name} omp pthread) | |||
| endif() | |||
| #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) | |||
| endmacro() | |||