Update from develop branch for 0.3.19 release (tags/v0.3.19)
| @@ -3,10 +3,13 @@ | |||
| ## | |||
| cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 19) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -20,51 +23,68 @@ endif() | |||
| ####### | |||
| if(MSVC) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| endif() | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
| option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| set(NO_AFFINITY 1) | |||
| set(NO_AFFINITY 1) | |||
| endif() | |||
| option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) | |||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
| option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
| if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
| set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| endif() | |||
| if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) | |||
| message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") | |||
| set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) | |||
| endif() | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoids conflicts with other BLAS libraries, especially when using | |||
| # 64 bit integer interfaces in OpenBLAS. | |||
| set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) | |||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
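The SYMBOLPREFIX/SYMBOLSUFFIX cache variables above let a renamed OpenBLAS coexist with another BLAS in the same process, which matters most for INTERFACE64 (64-bit integer) builds. A minimal configure sketch, assuming an out-of-tree build directory; the `_64` value is only the example suggested by the option's help text, not a project default:

    mkdir build && cd build
    # append _64 to every exported BLAS/LAPACK symbol of the shared library
    cmake -DBUILD_SHARED_LIBS=ON -DINTERFACE64=1 -DSYMBOLSUFFIX=_64 ..
    cmake --build .

Note that the renaming step further down only runs for shared-library builds, so BUILD_SHARED_LIBS=ON is part of the sketch.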
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| endif() | |||
| if(BUILD_WITHOUT_CBLAS) | |||
| set(NO_CBLAS 1) | |||
| set(NO_CBLAS 1) | |||
| endif() | |||
| ####### | |||
| if(MSVC AND MSVC_STATIC_CRT) | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| endif() | |||
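The MSVC_STATIC_CRT branch above rewrites /MD to /MT in every cached C and C++ flag set, so the resulting library links against the static C runtime. A hedged configure sketch using the clang-cl toolchain that the AppVeyor and Azure jobs later in this diff also use:

    mkdir build && cd build
    # MSVC_STATIC_CRT=ON triggers the /MD -> /MT flag rewrite shown above
    cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON ..
    cmake --build . --config Release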
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| @@ -98,7 +118,7 @@ endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| # set(BUILD_BFLOAT16 true) | |||
| # set(BUILD_BFLOAT16 true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -143,9 +163,10 @@ endif () | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| if(MSVC) | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||
| endif () | |||
| # get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) | |||
| set(TARGET_OBJS "") | |||
| foreach (SUBDIR ${SUBDIRS}) | |||
| @@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH}) | |||
| endif () | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| if(NOT NO_LAPACK) | |||
| add_library(LAPACK OBJECT ${LA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>") | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>") | |||
| endif() | |||
| if(BUILD_RELAPACK) | |||
| add_library(RELAPACK OBJECT ${RELA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>") | |||
| endif() | |||
| set(OpenBLAS_LIBS "") | |||
| if(BUILD_STATIC_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) | |||
| endif() | |||
| if(BUILD_STATIC_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) | |||
| else() | |||
| add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} m) | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static m) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared m) | |||
| endif() | |||
| endif() | |||
| if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| endif() | |||
| # Handle MSVC exports | |||
| @@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
| else() | |||
| # Creates verbose .def file (51KB vs 18KB) | |||
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
| set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
| endif() | |||
| endif() | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| endforeach() | |||
| enable_testing() | |||
| @@ -220,10 +290,17 @@ if (USE_THREAD) | |||
| # Add threading library to linker | |||
| find_package(Threads) | |||
| if (THREADS_HAVE_PTHREAD_ARG) | |||
| set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") | |||
| set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| COMPILE_OPTIONS "-pthread" | |||
| INTERFACE_COMPILE_OPTIONS "-pthread" | |||
| ) | |||
| endif() | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| #if (MSVC OR NOT NOFORTRAN) | |||
| @@ -239,97 +316,109 @@ if (NOT NOFORTRAN) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| endif() | |||
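The CPP_THREAD_SAFETY_TEST and CPP_THREAD_SAFETY_GEMV options declared near the top of this file gate the cpp_thread_test subdirectory added above; both require OpenMP, and the DGEMM variant needs about 1.3 GB of RAM. A minimal sketch of enabling them, assuming the test binaries are driven through CTest:

    cmake -DUSE_OPENMP=1 -DCPP_THREAD_SAFETY_TEST=ON -DCPP_THREAD_SAFETY_GEMV=ON ..
    cmake --build .
    ctest   # assumes the cpp_thread_test targets register themselves with CTest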
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") | |||
| else() | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| set(ARCH_IN ${ARCH}) | |||
| endif() | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| set(ARCH_IN ${ARCH}) | |||
| endif() | |||
| if (${CORE} STREQUAL "generic") | |||
| set(ARCH_IN "GENERIC") | |||
| endif () | |||
| if (${CORE} STREQUAL "generic") | |||
| set(ARCH_IN "GENERIC") | |||
| endif () | |||
| if (NOT DEFINED EXPRECISION) | |||
| set(EXPRECISION_IN 0) | |||
| else() | |||
| set(EXPRECISION_IN ${EXPRECISION}) | |||
| endif() | |||
| if (NOT DEFINED EXPRECISION) | |||
| set(EXPRECISION_IN 0) | |||
| else() | |||
| set(EXPRECISION_IN ${EXPRECISION}) | |||
| endif() | |||
| if (NOT DEFINED NO_CBLAS) | |||
| set(NO_CBLAS_IN 0) | |||
| else() | |||
| set(NO_CBLAS_IN ${NO_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED NO_CBLAS) | |||
| set(NO_CBLAS_IN 0) | |||
| else() | |||
| set(NO_CBLAS_IN ${NO_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| set(NO_LAPACK_IN 0) | |||
| else() | |||
| set(NO_LAPACK_IN ${NO_LAPACK}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| set(NO_LAPACK_IN 0) | |||
| else() | |||
| set(NO_LAPACK_IN ${NO_LAPACK}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACKE) | |||
| set(NO_LAPACKE_IN 0) | |||
| else() | |||
| set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACKE) | |||
| set(NO_LAPACKE_IN 0) | |||
| else() | |||
| set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
| endif() | |||
| if (NOT DEFINED NEED2UNDERSCORES) | |||
| set(NEED2UNDERSCORES_IN 0) | |||
| else() | |||
| set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
| endif() | |||
| if (NOT DEFINED NEED2UNDERSCORES) | |||
| set(NEED2UNDERSCORES_IN 0) | |||
| else() | |||
| set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
| endif() | |||
| if (NOT DEFINED ONLY_CBLAS) | |||
| set(ONLY_CBLAS_IN 0) | |||
| else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED ONLY_CBLAS) | |||
| set(ONLY_CBLAS_IN 0) | |||
| else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED BU) | |||
| set(BU _) | |||
| endif() | |||
| if (NOT DEFINED BU) | |||
| set(BU _) | |||
| endif() | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| endif() | |||
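The POST_BUILD command above pipes the gensymbol output into objcopy --redefine-syms, which consumes a plain-text map with one "<old> <new>" pair per line. A hand-rolled sketch of the same rename; the symbol names and the mylib_ prefix are invented purely for illustration:

    # objcopy.def format: one "old new" pair per line
    cat > objcopy.def <<'EOF'
    dgemm_ mylib_dgemm_
    cblas_dgemm mylib_cblas_dgemm
    EOF
    objcopy -v --redefine-syms objcopy.def lib/libopenblas.so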
| # Install project | |||
| # Install libraries | |||
| install(TARGETS ${OpenBLAS_LIBNAME} | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) | |||
| install(TARGETS ${OpenBLAS_LIBNAME}_shared | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| install(TARGETS ${OpenBLAS_LIBNAME}_static | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| else() | |||
| install(TARGETS ${OpenBLAS_LIBS} | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| endif() | |||
| # Install headers | |||
| set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| @@ -365,36 +454,41 @@ if(NOT NOFORTRAN) | |||
| endif() | |||
| if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) | |||
| FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
| install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| ADD_CUSTOM_TARGET(genlapacke | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
| ) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| if(BUILD_STATIC_LIBS) | |||
| add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) | |||
| endif() | |||
| FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
| install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| ADD_CUSTOM_TARGET(genlapacke | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
| ) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| endif() | |||
| # Install pkg-config files | |||
| @@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -197,3 +197,7 @@ In chronological order: | |||
| * River Dillon <oss@outerpassage.net> | |||
| * [2021-07-10] fix compilation with musl libc | |||
| * Bine Brank <https://github.com/binebrank> | |||
| * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | |||
| * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM | |||
| @@ -1,4 +1,51 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.19 | |||
| 19-Dec-2021 | |||
| general: | |||
| - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16 | |||
| - fixed a potential thread race in the thread buffer reallocation routines | |||
| that were introduced in 0.3.18 | |||
| - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE | |||
| - fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG | |||
| - made automatic library suffix for CMAKE builds with INTERFACE64 available | |||
| to CBLAS-only builds | |||
| x86_64: | |||
| - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities | |||
| when an unknown CPUID is encountered, instead of defaulting to Prescott | |||
| - added cpu detection for Intel Alder Lake | |||
| - added cpu detection for Intel Sapphire Rapids | |||
| - added an optimized SBGEMM kernel for Sapphire Rapids | |||
| - fixed DYNAMIC_ARCH builds on OSX with CMAKE | |||
| - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX | |||
| - fixed missing thread initialization for static builds on Windows/MSVC | |||
| - fixed an excessive read in ZSYMV | |||
| POWER: | |||
| - added support for POWER10 in big-endian mode | |||
| - added support for building with CMAKE | |||
| - added optimized SGEMM and DGEMM kernels for small matrix sizes | |||
| ARMV8: | |||
| - added basic support and cputype detection for Fujitsu A64FX | |||
| - added a generic ARMV8SVE target | |||
| - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX | |||
| - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus | |||
| - fixed cpuid detection for Apple M1 and improved performance | |||
| - improved compiler flag setting in CMAKE builds | |||
| RISCV64: | |||
| - fixed improper initialization in CSCAL/ZSCAL for strided access patterns | |||
| MIPS: | |||
| - added a GENERIC target for MIPS32 | |||
| - added support for cross-compiling to MIPS32 on x86_64 using CMAKE | |||
| MIPS64: | |||
| - fixed misdetection of MSA capability | |||
| ==================================================================== | |||
| Version 0.3.18 | |||
| 02-Oct-2021 | |||
| @@ -32,7 +32,7 @@ export NOFORTRAN | |||
| export NO_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||
| @@ -1,6 +1,9 @@ | |||
| ifneq ($(C_COMPILER), PGI) | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| ISCLANG=1 | |||
| endif | |||
| ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a | |||
| @@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV8SVE) | |||
| CCOMMON_OPT += -march=armv8-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a+sve | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -48,7 +58,7 @@ endif | |||
| # Use a72 tunings because Neoverse-N1 is only available | |||
| # in GCC>=9 | |||
| ifeq ($(CORE), NEOVERSEN1) | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -70,7 +80,7 @@ endif | |||
| # Use a53 tunings because a55 is only available in GCC>=8.1 | |||
| ifeq ($(CORE), CORTEXA55) | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ8), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -150,6 +160,15 @@ endif | |||
| endif | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), A64FX) | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.18 | |||
| VERSION = 0.3.18.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -9,11 +9,10 @@ ifndef TOPDIR | |||
| TOPDIR = . | |||
| endif | |||
| # If ARCH is not set, we use the host system's architecture for getarch compile options. | |||
| ifndef ARCH | |||
| # we need to use the host system's architecture for getarch compile options even especially when cross-compiling | |||
| HOSTARCH := $(shell uname -m) | |||
| else | |||
| HOSTARCH = $(ARCH) | |||
| ifeq ($(HOSTARCH), amd64) | |||
| HOSTARCH=x86_64 | |||
| endif | |||
| # Catch conflicting usage of ARCH in some BSD environments | |||
| @@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET | |||
| ifeq ($(TARGET), GENERIC) | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| override NO_EXPRECISION=1 | |||
| export NO_EXPRECiSION | |||
| export NO_EXPRECISION | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -119,6 +118,9 @@ endif | |||
| ifeq ($(TARGET), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SAPPHIRERAPIDS) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -143,8 +145,13 @@ endif | |||
| ifeq ($(TARGET), POWER8) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| ifeq ($(TARGET), POWER9) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| ifeq ($(TARGET), POWER10) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| endif | |||
| #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
| # | |||
| @@ -164,6 +171,9 @@ endif | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -251,6 +261,8 @@ endif | |||
| #For small matrix optimization | |||
| ifeq ($(ARCH), x86_64) | |||
| SMALL_MATRIX_OPT = 1 | |||
| else ifeq ($(CORE), POWER10) | |||
| SMALL_MATRIX_OPT = 1 | |||
| endif | |||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
| @@ -260,6 +272,10 @@ endif | |||
| ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| # Determine if the assembler is GNU Assembler | |||
| HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) | |||
| GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| @@ -307,7 +323,7 @@ else | |||
| SMP = 1 | |||
| endif | |||
| else | |||
| ifeq ($(NUM_THREAD), 1) | |||
| ifeq ($(NUM_THREADS), 1) | |||
| SMP = | |||
| else | |||
| SMP = 1 | |||
| @@ -892,15 +908,25 @@ endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||
| PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) | |||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) | |||
| PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) | |||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) | |||
| PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) | |||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) | |||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) | |||
| NEWPGI := 1 | |||
| PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) | |||
| PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) | |||
| PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) | |||
| ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) | |||
| NEWPGI2 := 1 | |||
| endif | |||
| endif | |||
| ifdef BINARY64 | |||
| ifeq ($(ARCH), x86_64) | |||
| ifneq ($(NEWPGI2),1) | |||
| CCOMMON_OPT += -tp p7-64 | |||
| else | |||
| CCOMMON_OPT += -tp px | |||
| endif | |||
| ifneq ($(NEWPGI),1) | |||
| CCOMMON_OPT += -D__MMX__ -Mnollvm | |||
| endif | |||
| @@ -915,7 +941,11 @@ endif | |||
| endif | |||
| endif | |||
| else | |||
| ifneq ($(NEWPGI2),1) | |||
| CCOMMON_OPT += -tp p7 | |||
| else | |||
| CCOMMON_OPT += -tp px | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| ifneq ($(NEWPGI2),1) | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else | |||
| FCOMMON_OPT += -tp px | |||
| endif | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER6) | |||
| $(warning NVIDIA HPC compilers do not support POWER6.) | |||
| @@ -1643,8 +1677,10 @@ export HAVE_VFP | |||
| export HAVE_VFPV3 | |||
| export HAVE_VFPV4 | |||
| export HAVE_NEON | |||
| export HAVE_MSA | |||
| export MSA_FLAGS | |||
| ifndef NO_MSA | |||
| export HAVE_MSA | |||
| export MSA_FLAGS | |||
| endif | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| @@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SAPPHIRERAPIDS) | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # sapphire rapids support was added in 11 | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| @@ -23,6 +23,7 @@ HASWELL | |||
| SKYLAKEX | |||
| ATOM | |||
| COOPERLAKE | |||
| SAPPHIRERAPIDS | |||
| b)AMD CPU: | |||
| ATHLON | |||
| @@ -29,15 +29,15 @@ environment: | |||
| global: | |||
| CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | |||
| matrix: | |||
| - COMPILER: clang-cl | |||
| WITH_FORTRAN: ON | |||
| - COMPILER: clang-cl | |||
| DYNAMIC_ARCH: ON | |||
| WITH_FORTRAN: OFF | |||
| - COMPILER: cl | |||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
| DYNAMIC_ARCH: OFF | |||
| WITH_FORTRAN: ignore | |||
| # - COMPILER: clang-cl | |||
| # WITH_FORTRAN: ON | |||
| # - COMPILER: clang-cl | |||
| # DYNAMIC_ARCH: ON | |||
| # WITH_FORTRAN: OFF | |||
| # - COMPILER: cl | |||
| # - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
| # DYNAMIC_ARCH: OFF | |||
| # WITH_FORTRAN: ignore | |||
| - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
| COMPILER: MinGW-gcc-6.3.0-32 | |||
| - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
| @@ -46,6 +46,7 @@ environment: | |||
| install: | |||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | |||
| - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda | |||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
| - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||
| @@ -64,8 +65,8 @@ before_build: | |||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
| @@ -75,7 +75,50 @@ jobs: | |||
| cd utest | |||
| dir | |||
| openblas_utest.exe | |||
| - job: Windows_mingw_gmake | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" | |||
| - job: Windows_clang_cmake | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
| set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
| conda config --add channels conda-forge --force | |||
| conda config --set auto_update_conda false | |||
| conda install --yes ninja | |||
| call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| mkdir build | |||
| cd build | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: Windows_flang_clang | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
| set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
| conda config --add channels conda-forge --force | |||
| conda config --set auto_update_conda false | |||
| conda install --yes --quiet ninja flang | |||
| mkdir build | |||
| cd build | |||
| call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: OSX_OpenMP | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| @@ -122,7 +165,7 @@ jobs: | |||
| make | |||
| ctest | |||
| - job: OSX_OpenMP_Clang_gf_cmake | |||
| - job: OSX_dynarch_cmake | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| @@ -130,14 +173,12 @@ jobs: | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm libomp | |||
| mkdir build | |||
| cd build | |||
| cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. | |||
| make | |||
| cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
| cmake --build . | |||
| ctest | |||
| - job: OSX_Ifort_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| @@ -179,7 +220,7 @@ jobs: | |||
| brew update | |||
| brew install --cask android-ndk | |||
| export ANDROID_NDK_HOME=/usr/local/share/android-ndk | |||
| make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
| make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
| - job: OSX_IOS_ARMV8 | |||
| pool: | |||
| @@ -206,9 +247,9 @@ jobs: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ | |||
| && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ | |||
| || exit 1 | |||
| wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ | |||
| && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ | |||
| || exit 1 | |||
| alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 | |||
| @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| @@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64") | |||
| endif () | |||
| endif () | |||
| if (MIPS64) | |||
| if (MIPS32 OR MIPS64) | |||
| set(NO_BINARY_MODE 1) | |||
| endif () | |||
| @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS | |||
| if (NO_BINARY_MODE) | |||
| if (MIPS32) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| if (MIPS64) | |||
| if (BINARY64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") | |||
| @@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL SAPPHIRERAPIDS) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL A64FX) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL POWER10) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
| else () | |||
| message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." ) | |||
| endif() | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL POWER9) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL POWER8) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| endif () | |||
| endif () | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (HAVE_AVX2) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") | |||
| @@ -3,11 +3,6 @@ | |||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | |||
| ## Sets Fortran related variables. | |||
| if (INTERFACE64) | |||
| set(SUFFIX64 64) | |||
| set(SUFFIX64_UNDERSCORE _64) | |||
| endif() | |||
| if (${F_COMPILER} STREQUAL "FLANG") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| @@ -1,214 +1,218 @@ | |||
| # helper functions for the kernel CMakeLists.txt | |||
| function(SetFallback KERNEL SOURCE_PATH) | |||
| if (NOT (DEFINED ${KERNEL})) | |||
| set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) | |||
| endif () | |||
| endfunction() | |||
| # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||
| macro(SetDefaultL1) | |||
| set(SAMAXKERNEL amax.S) | |||
| set(DAMAXKERNEL amax.S) | |||
| set(QAMAXKERNEL amax.S) | |||
| set(CAMAXKERNEL zamax.S) | |||
| set(ZAMAXKERNEL zamax.S) | |||
| set(XAMAXKERNEL zamax.S) | |||
| set(SAMINKERNEL amin.S) | |||
| set(DAMINKERNEL amin.S) | |||
| set(QAMINKERNEL amin.S) | |||
| set(CAMINKERNEL zamin.S) | |||
| set(ZAMINKERNEL zamin.S) | |||
| set(XAMINKERNEL zamin.S) | |||
| set(SMAXKERNEL max.S) | |||
| set(DMAXKERNEL max.S) | |||
| set(QMAXKERNEL max.S) | |||
| set(SMINKERNEL min.S) | |||
| set(DMINKERNEL min.S) | |||
| set(QMINKERNEL min.S) | |||
| set(ISAMAXKERNEL iamax.S) | |||
| set(IDAMAXKERNEL iamax.S) | |||
| set(IQAMAXKERNEL iamax.S) | |||
| set(ICAMAXKERNEL izamax.S) | |||
| set(IZAMAXKERNEL izamax.S) | |||
| set(IXAMAXKERNEL izamax.S) | |||
| set(ISAMINKERNEL iamin.S) | |||
| set(IDAMINKERNEL iamin.S) | |||
| set(IQAMINKERNEL iamin.S) | |||
| set(ICAMINKERNEL izamin.S) | |||
| set(IZAMINKERNEL izamin.S) | |||
| set(IXAMINKERNEL izamin.S) | |||
| set(ISMAXKERNEL iamax.S) | |||
| set(IDMAXKERNEL iamax.S) | |||
| set(IQMAXKERNEL iamax.S) | |||
| set(ISMINKERNEL iamin.S) | |||
| set(IDMINKERNEL iamin.S) | |||
| set(IQMINKERNEL iamin.S) | |||
| set(SASUMKERNEL asum.S) | |||
| set(DASUMKERNEL asum.S) | |||
| set(CASUMKERNEL zasum.S) | |||
| set(ZASUMKERNEL zasum.S) | |||
| set(QASUMKERNEL asum.S) | |||
| set(XASUMKERNEL zasum.S) | |||
| set(SAXPYKERNEL axpy.S) | |||
| set(DAXPYKERNEL axpy.S) | |||
| set(CAXPYKERNEL zaxpy.S) | |||
| set(ZAXPYKERNEL zaxpy.S) | |||
| set(QAXPYKERNEL axpy.S) | |||
| set(XAXPYKERNEL zaxpy.S) | |||
| set(SCOPYKERNEL copy.S) | |||
| set(DCOPYKERNEL copy.S) | |||
| set(CCOPYKERNEL zcopy.S) | |||
| set(ZCOPYKERNEL zcopy.S) | |||
| set(QCOPYKERNEL copy.S) | |||
| set(XCOPYKERNEL zcopy.S) | |||
| set(SDOTKERNEL dot.S) | |||
| set(DDOTKERNEL dot.S) | |||
| set(CDOTKERNEL zdot.S) | |||
| set(ZDOTKERNEL zdot.S) | |||
| set(QDOTKERNEL dot.S) | |||
| set(XDOTKERNEL zdot.S) | |||
| set(SNRM2KERNEL nrm2.S) | |||
| set(DNRM2KERNEL nrm2.S) | |||
| set(QNRM2KERNEL nrm2.S) | |||
| set(CNRM2KERNEL znrm2.S) | |||
| set(ZNRM2KERNEL znrm2.S) | |||
| set(XNRM2KERNEL znrm2.S) | |||
| set(SROTKERNEL rot.S) | |||
| set(DROTKERNEL rot.S) | |||
| set(QROTKERNEL rot.S) | |||
| set(CROTKERNEL zrot.S) | |||
| set(ZROTKERNEL zrot.S) | |||
| set(XROTKERNEL zrot.S) | |||
| set(SSCALKERNEL scal.S) | |||
| set(DSCALKERNEL scal.S) | |||
| set(CSCALKERNEL zscal.S) | |||
| set(ZSCALKERNEL zscal.S) | |||
| set(QSCALKERNEL scal.S) | |||
| set(XSCALKERNEL zscal.S) | |||
| set(SSWAPKERNEL swap.S) | |||
| set(DSWAPKERNEL swap.S) | |||
| set(CSWAPKERNEL zswap.S) | |||
| set(ZSWAPKERNEL zswap.S) | |||
| set(QSWAPKERNEL swap.S) | |||
| set(XSWAPKERNEL zswap.S) | |||
| set(SGEMVNKERNEL gemv_n.S) | |||
| set(SGEMVTKERNEL gemv_t.S) | |||
| set(DGEMVNKERNEL gemv_n.S) | |||
| set(DGEMVTKERNEL gemv_t.S) | |||
| set(CGEMVNKERNEL zgemv_n.S) | |||
| set(CGEMVTKERNEL zgemv_t.S) | |||
| set(ZGEMVNKERNEL zgemv_n.S) | |||
| set(ZGEMVTKERNEL zgemv_t.S) | |||
| set(QGEMVNKERNEL gemv_n.S) | |||
| set(QGEMVTKERNEL gemv_t.S) | |||
| set(XGEMVNKERNEL zgemv_n.S) | |||
| set(XGEMVTKERNEL zgemv_t.S) | |||
| set(SCABS_KERNEL ../generic/cabs.c) | |||
| set(DCABS_KERNEL ../generic/cabs.c) | |||
| set(QCABS_KERNEL ../generic/cabs.c) | |||
| set(LSAME_KERNEL ../generic/lsame.c) | |||
| set(SAXPBYKERNEL ../arm/axpby.c) | |||
| set(DAXPBYKERNEL ../arm/axpby.c) | |||
| set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(SSUMKERNEL sum.S) | |||
| set(DSUMKERNEL sum.S) | |||
| set(CSUMKERNEL zsum.S) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| SetFallback(SAMAXKERNEL amax.S) | |||
| SetFallback(DAMAXKERNEL amax.S) | |||
| SetFallback(QAMAXKERNEL amax.S) | |||
| SetFallback(CAMAXKERNEL zamax.S) | |||
| SetFallback(ZAMAXKERNEL zamax.S) | |||
| SetFallback(XAMAXKERNEL zamax.S) | |||
| SetFallback(SAMINKERNEL amin.S) | |||
| SetFallback(DAMINKERNEL amin.S) | |||
| SetFallback(QAMINKERNEL amin.S) | |||
| SetFallback(CAMINKERNEL zamin.S) | |||
| SetFallback(ZAMINKERNEL zamin.S) | |||
| SetFallback(XAMINKERNEL zamin.S) | |||
| SetFallback(SMAXKERNEL max.S) | |||
| SetFallback(DMAXKERNEL max.S) | |||
| SetFallback(QMAXKERNEL max.S) | |||
| SetFallback(SMINKERNEL min.S) | |||
| SetFallback(DMINKERNEL min.S) | |||
| SetFallback(QMINKERNEL min.S) | |||
| SetFallback(ISAMAXKERNEL iamax.S) | |||
| SetFallback(IDAMAXKERNEL iamax.S) | |||
| SetFallback(IQAMAXKERNEL iamax.S) | |||
| SetFallback(ICAMAXKERNEL izamax.S) | |||
| SetFallback(IZAMAXKERNEL izamax.S) | |||
| SetFallback(IXAMAXKERNEL izamax.S) | |||
| SetFallback(ISAMINKERNEL iamin.S) | |||
| SetFallback(IDAMINKERNEL iamin.S) | |||
| SetFallback(IQAMINKERNEL iamin.S) | |||
| SetFallback(ICAMINKERNEL izamin.S) | |||
| SetFallback(IZAMINKERNEL izamin.S) | |||
| SetFallback(IXAMINKERNEL izamin.S) | |||
| SetFallback(ISMAXKERNEL iamax.S) | |||
| SetFallback(IDMAXKERNEL iamax.S) | |||
| SetFallback(IQMAXKERNEL iamax.S) | |||
| SetFallback(ISMINKERNEL iamin.S) | |||
| SetFallback(IDMINKERNEL iamin.S) | |||
| SetFallback(IQMINKERNEL iamin.S) | |||
| SetFallback(SASUMKERNEL asum.S) | |||
| SetFallback(DASUMKERNEL asum.S) | |||
| SetFallback(CASUMKERNEL zasum.S) | |||
| SetFallback(ZASUMKERNEL zasum.S) | |||
| SetFallback(QASUMKERNEL asum.S) | |||
| SetFallback(XASUMKERNEL zasum.S) | |||
| SetFallback(SAXPYKERNEL axpy.S) | |||
| SetFallback(DAXPYKERNEL axpy.S) | |||
| SetFallback(CAXPYKERNEL zaxpy.S) | |||
| SetFallback(ZAXPYKERNEL zaxpy.S) | |||
| SetFallback(QAXPYKERNEL axpy.S) | |||
| SetFallback(XAXPYKERNEL zaxpy.S) | |||
| SetFallback(SCOPYKERNEL copy.S) | |||
| SetFallback(DCOPYKERNEL copy.S) | |||
| SetFallback(CCOPYKERNEL zcopy.S) | |||
| SetFallback(ZCOPYKERNEL zcopy.S) | |||
| SetFallback(QCOPYKERNEL copy.S) | |||
| SetFallback(XCOPYKERNEL zcopy.S) | |||
| SetFallback(SDOTKERNEL dot.S) | |||
| SetFallback(DDOTKERNEL dot.S) | |||
| SetFallback(CDOTKERNEL zdot.S) | |||
| SetFallback(ZDOTKERNEL zdot.S) | |||
| SetFallback(QDOTKERNEL dot.S) | |||
| SetFallback(XDOTKERNEL zdot.S) | |||
| SetFallback(SNRM2KERNEL nrm2.S) | |||
| SetFallback(DNRM2KERNEL nrm2.S) | |||
| SetFallback(QNRM2KERNEL nrm2.S) | |||
| SetFallback(CNRM2KERNEL znrm2.S) | |||
| SetFallback(ZNRM2KERNEL znrm2.S) | |||
| SetFallback(XNRM2KERNEL znrm2.S) | |||
| SetFallback(SROTKERNEL rot.S) | |||
| SetFallback(DROTKERNEL rot.S) | |||
| SetFallback(QROTKERNEL rot.S) | |||
| SetFallback(CROTKERNEL zrot.S) | |||
| SetFallback(ZROTKERNEL zrot.S) | |||
| SetFallback(XROTKERNEL zrot.S) | |||
| SetFallback(SSCALKERNEL scal.S) | |||
| SetFallback(DSCALKERNEL scal.S) | |||
| SetFallback(CSCALKERNEL zscal.S) | |||
| SetFallback(ZSCALKERNEL zscal.S) | |||
| SetFallback(QSCALKERNEL scal.S) | |||
| SetFallback(XSCALKERNEL zscal.S) | |||
| SetFallback(SSWAPKERNEL swap.S) | |||
| SetFallback(DSWAPKERNEL swap.S) | |||
| SetFallback(CSWAPKERNEL zswap.S) | |||
| SetFallback(ZSWAPKERNEL zswap.S) | |||
| SetFallback(QSWAPKERNEL swap.S) | |||
| SetFallback(XSWAPKERNEL zswap.S) | |||
| SetFallback(SGEMVNKERNEL gemv_n.S) | |||
| SetFallback(SGEMVTKERNEL gemv_t.S) | |||
| SetFallback(DGEMVNKERNEL gemv_n.S) | |||
| SetFallback(DGEMVTKERNEL gemv_t.S) | |||
| SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(QGEMVNKERNEL gemv_n.S) | |||
| SetFallback(QGEMVTKERNEL gemv_t.S) | |||
| SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(SCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(DCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(QCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(LSAME_KERNEL ../generic/lsame.c) | |||
| SetFallback(SAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(DAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| SetFallback(SSUMKERNEL sum.S) | |||
| SetFallback(DSUMKERNEL sum.S) | |||
| SetFallback(CSUMKERNEL zsum.S) | |||
| SetFallback(ZSUMKERNEL zsum.S) | |||
| SetFallback(QSUMKERNEL sum.S) | |||
| SetFallback(XSUMKERNEL zsum.S) | |||
| if (BUILD_BFLOAT16) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| set(SHMINKERNEL ../arm/min.c) | |||
| set(ISHAMAXKERNEL ../arm/iamax.c) | |||
| set(ISHAMINKERNEL ../arm/iamin.c) | |||
| set(ISHMAXKERNEL ../arm/imax.c) | |||
| set(ISHMINKERNEL ../arm/imin.c) | |||
| set(SHASUMKERNEL ../arm/asum.c) | |||
| set(SHAXPYKERNEL ../arm/axpy.c) | |||
| set(SHAXPBYKERNEL ../arm/axpby.c) | |||
| set(SHCOPYKERNEL ../arm/copy.c) | |||
| set(SBDOTKERNEL ../x86_64/sbdot.c) | |||
| set(SHROTKERNEL ../arm/rot.c) | |||
| set(SHSCALKERNEL ../arm/scal.c) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| set(SHSUMKERNEL ../arm/sum.c) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| set(TOBF16KERNEL ../x86_64/tobf16.c) | |||
| set(BF16TOKERNEL ../x86_64/bf16to.c) | |||
| set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| SetFallback(SHAMINKERNEL ../arm/amin.c) | |||
| SetFallback(SHAMAXKERNEL ../arm/amax.c) | |||
| SetFallback(SHMAXKERNEL ../arm/max.c) | |||
| SetFallback(SHMINKERNEL ../arm/min.c) | |||
| SetFallback(ISHAMAXKERNEL ../arm/iamax.c) | |||
| SetFallback(ISHAMINKERNEL ../arm/iamin.c) | |||
| SetFallback(ISHMAXKERNEL ../arm/imax.c) | |||
| SetFallback(ISHMINKERNEL ../arm/imin.c) | |||
| SetFallback(SHASUMKERNEL ../arm/asum.c) | |||
| SetFallback(SHAXPYKERNEL ../arm/axpy.c) | |||
| SetFallback(SHAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(SHCOPYKERNEL ../arm/copy.c) | |||
| SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) | |||
| SetFallback(SHROTKERNEL ../arm/rot.c) | |||
| SetFallback(SHSCALKERNEL ../arm/scal.c) | |||
| SetFallback(SHNRM2KERNEL ../arm/nrm2.c) | |||
| SetFallback(SHSUMKERNEL ../arm/sum.c) | |||
| SetFallback(SHSWAPKERNEL ../arm/swap.c) | |||
| SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) | |||
| SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) | |||
| SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| endif () | |||
| endmacro () | |||
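Note: the SetFallback helper invoked throughout this macro is not defined in this hunk. Presumably it assigns the generic kernel source only when the variable has not already been set by a target-specific KERNEL file, so the lists above act as true fallbacks rather than overrides. A minimal sketch under that assumption (the macro body here is illustrative, not the actual OpenBLAS definition):

    macro(SetFallback KERNEL_VAR KERNEL_FILE)
      # Assign the generic source only if no target-specific kernel set the variable already.
      if (NOT DEFINED ${KERNEL_VAR})
        set(${KERNEL_VAR} ${KERNEL_FILE})
      endif ()
    endmacro ()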
| macro(SetDefaultL2) | |||
| set(SGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(DGEMVNKERNEL gemv_n.S) | |||
| set(DGEMVTKERNEL gemv_t.S) | |||
| set(CGEMVNKERNEL zgemv_n.S) | |||
| set(CGEMVTKERNEL zgemv_t.S) | |||
| set(ZGEMVNKERNEL zgemv_n.S) | |||
| set(ZGEMVTKERNEL zgemv_t.S) | |||
| set(QGEMVNKERNEL gemv_n.S) | |||
| set(QGEMVTKERNEL gemv_t.S) | |||
| set(XGEMVNKERNEL zgemv_n.S) | |||
| set(XGEMVTKERNEL zgemv_t.S) | |||
| set(SGERKERNEL ../generic/ger.c) | |||
| set(DGERKERNEL ../generic/ger.c) | |||
| set(QGERKERNEL ../generic/ger.c) | |||
| set(CGERUKERNEL ../generic/zger.c) | |||
| set(CGERCKERNEL ../generic/zger.c) | |||
| set(ZGERUKERNEL ../generic/zger.c) | |||
| set(ZGERCKERNEL ../generic/zger.c) | |||
| set(XGERUKERNEL ../generic/zger.c) | |||
| set(XGERCKERNEL ../generic/zger.c) | |||
| set(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) | |||
| SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) | |||
| SetFallback(DGEMVNKERNEL gemv_n.S) | |||
| SetFallback(DGEMVTKERNEL gemv_t.S) | |||
| SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(QGEMVNKERNEL gemv_n.S) | |||
| SetFallback(QGEMVTKERNEL gemv_t.S) | |||
| SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(SGERKERNEL ../generic/ger.c) | |||
| SetFallback(DGERKERNEL ../generic/ger.c) | |||
| SetFallback(QGERKERNEL ../generic/ger.c) | |||
| SetFallback(CGERUKERNEL ../generic/zger.c) | |||
| SetFallback(CGERCKERNEL ../generic/zger.c) | |||
| SetFallback(ZGERUKERNEL ../generic/zger.c) | |||
| SetFallback(ZGERCKERNEL ../generic/zger.c) | |||
| SetFallback(XGERUKERNEL ../generic/zger.c) | |||
| SetFallback(XGERCKERNEL ../generic/zger.c) | |||
| SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| if (BUILD_BFLOAT16) | |||
| set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| SetFallback(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL3) | |||
| set(SGEADD_KERNEL ../generic/geadd.c) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| SetFallback(SGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(DGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| if (BUILD_BFLOAT16) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SBGEMM_BETA ../generic/gemm_beta.c) | |||
| set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
| set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
| set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
| set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
| SetFallback(SHGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) | |||
| SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
| SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
| SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
| SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -416,7 +416,7 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ARMV8\n" | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| @@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" MATCHES "MIPS") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 262144\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "POWER6") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| @@ -33,7 +33,7 @@ endif () | |||
| if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| set(NO_AVX 1) | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") | |||
| set(TARGET "NEHALEM") | |||
| endif () | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| @@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") | |||
| set(TARGET "ARMV7") | |||
| endif () | |||
| if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") | |||
| set(TARGET "POWER6") | |||
| endif () | |||
| endif () | |||
| @@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) | |||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") | |||
| endif () | |||
| if (POWER) | |||
| set(NO_WARMUP 1) | |||
| set(HAVE_GAS 1) | |||
| if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") | |||
| set(HAVE_GAS 0) | |||
| elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") | |||
| set(HAVE_GAS 0) | |||
| endif () | |||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") | |||
| endif () | |||
| # If Fortran is not used, only CBLAS will be compiled. | |||
| if (ONLY_CBLAS) | |||
| set(NO_LAPACK 1) | |||
| @@ -163,6 +178,22 @@ if (DEFINED TARGET) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| @@ -206,6 +237,27 @@ if (DEFINED TARGET) | |||
| if (DEFINED HAVE_SSE4_1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") | |||
| endif() | |||
| if (${TARGET} STREQUAL POWER10) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
| else () | |||
| message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL POWER9) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
| else () | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL POWER8) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| endif() | |||
| endif() | |||
| if (DEFINED BINARY) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| @@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") | |||
| # C Compiler dependent settings | |||
| include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||
| if (INTERFACE64) | |||
| set(SUFFIX64 64) | |||
| set(SUFFIX64_UNDERSCORE _64) | |||
| endif() | |||
| if (NOT NOFORTRAN) | |||
| # Fortran Compiler dependent settings | |||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||
| @@ -258,7 +315,7 @@ if (NEED_PIC) | |||
| endif() | |||
| endif () | |||
| if (X86_64) | |||
| if (X86_64 OR ${CORE} STREQUAL POWER10) | |||
| set(SMALL_MATRIX_OPT TRUE) | |||
| endif () | |||
| if (SMALL_MATRIX_OPT) | |||
| @@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT) | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (X86 OR X86_64 OR ARM64 OR PPC) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| @@ -20,11 +20,11 @@ endif() | |||
| if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | |||
| if(MINGW) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | |||
| OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | |||
| OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE | |||
| OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
| if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
| set(MINGW64 1) | |||
| endif() | |||
| endif() | |||
| @@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64) | |||
| elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
| set(PPC 1) | |||
| set(POWER 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
| @@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) | |||
| else () | |||
| set(X86 1) | |||
| endif() | |||
| elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") | |||
| set(MIPS32 1) | |||
| elseif (${TARGET} STREQUAL "ARMV7") | |||
| set(ARM 1) | |||
| else() | |||
| @@ -86,8 +88,12 @@ if (X86_64) | |||
| set(ARCH "x86_64") | |||
| elseif(X86) | |||
| set(ARCH "x86") | |||
| elseif(PPC) | |||
| elseif(POWER) | |||
| set(ARCH "power") | |||
| elseif(MIPS32) | |||
| set(ARCH "mips") | |||
| elseif(MIPS64) | |||
| set(ARCH "mips64") | |||
| elseif(ARM) | |||
| set(ARCH "arm") | |||
| elseif(ARM64) | |||
| @@ -97,7 +103,7 @@ else() | |||
| endif () | |||
| if (NOT BINARY) | |||
| if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
| set(BINARY 64) | |||
| else () | |||
| set(BINARY 32) | |||
| @@ -15,35 +15,83 @@ endfunction () | |||
| # Reads a Makefile into CMake vars. | |||
| macro(ParseMakefileVars MAKEFILE_IN) | |||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| set (SkipIfs 0) | |||
| set (SkipElse 0) | |||
| file(STRINGS ${MAKEFILE_IN} makefile_contents) | |||
| foreach (makefile_line ${makefile_contents}) | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| # Skip the entire scope of an else branch when the if statement that precedes it had the true condition. | |||
| # The variable SkipIfs counts nested if statements so that we know which endif closes the scope of that else branch. | |||
| if (${SkipElse} EQUAL 1) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
| endif () | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if (${SkipIfs} EQUAL 0) | |||
| set (SkipElse 0) | |||
| else () | |||
| MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
| endif () | |||
| endif () | |||
| continue () | |||
| endif () | |||
| # The variable IfElse is greater than 0 while the parser is inside the scope of an if/else construct. | |||
| if (${IfElse} GREATER 0) | |||
| # If the current branch is the one being skipped, any nested if/else/endif statements | |||
| # must be ignored as well, up to the endif that closes the current scope. | |||
| string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| if (${SkipIfs} EQUAL 0) | |||
| #message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| else () | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
| endif () | |||
| continue () | |||
| endif () | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| continue () | |||
| endif() | |||
| if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| # message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if (${SkipIfs} EQUAL 0) | |||
| #message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| else () | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| endif () | |||
| continue () | |||
| endif() | |||
| # Skip the lines that are not part of the path that has to be taken. | |||
| if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| # Skip commented lines (the ones that start with '#') | |||
| string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on ${line_match}") | |||
| #message(STATUS "match on ${line_match}") | |||
| set(var_name ${CMAKE_MATCH_1}) | |||
| # set(var_value ${CMAKE_MATCH_2}) | |||
| #set(var_value ${CMAKE_MATCH_2}) | |||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | |||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | |||
| @@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) | |||
| endforeach () | |||
| set(${var_name} ${var_value}) | |||
| else () | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| continue () | |||
| endif () | |||
| # Include a new file to be parsed | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| continue () | |||
| endif () | |||
| # The if statement that precedes this else took the valid path, | |||
| # so the scope of this else statement has to be skipped. | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| set (SkipElse 1) | |||
| continue() | |||
| endif() | |||
| # Example 1: ifdef HAVE_MSA | |||
| # Example 2: ifndef ZNRM2KERNEL | |||
| string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") | |||
| set (ElseSeen 0) | |||
| if (DEFINED ${CMAKE_MATCH_2}) | |||
| if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| else () | |||
| # message(STATUS "unmatched line ${line_match}") | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
| set (IfElse 2) | |||
| else () | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) | |||
| set (CMAKE_MATCH_1 CMAKE_C_COMPILER) | |||
| endif () | |||
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| endif () | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| endif () | |||
| endif () | |||
| continue () | |||
| endif () | |||
| # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) | |||
| # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
| # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) | |||
| # Ignore the second group since non-capturing groups (?:...) are not supported in CMake regular expressions | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") | |||
| if (DEFINED ${CMAKE_MATCH_1}) | |||
| if (DEFINED ${CMAKE_MATCH_4}) | |||
| set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) | |||
| else () | |||
| set (STR ${${CMAKE_MATCH_1}}) | |||
| endif () | |||
| if (${STR} STREQUAL ${CMAKE_MATCH_5}) | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| continue () | |||
| endif () | |||
| endif () | |||
| set (IfElse 2) | |||
| continue () | |||
| endif () | |||
| # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") | |||
| set (ElseSeen 0) | |||
| set (HasValidGroup 0) | |||
| if (DEFINED ${CMAKE_MATCH_3}) | |||
| set (HasValidGroup 1) | |||
| set (STR ${${CMAKE_MATCH_3}}) | |||
| elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") | |||
| set (HasValidGroup 1) | |||
| set (STR ${CMAKE_MATCH_4}) | |||
| endif () | |||
| if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) | |||
| if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| continue () | |||
| endif () | |||
| endif () | |||
| set (IfElse 2) | |||
| continue () | |||
| endif () | |||
| #message(STATUS "unmatched line ${line_match}") | |||
| endforeach () | |||
| endmacro () | |||
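For illustration, a hypothetical call to the macro above (directory and file name are assumptions, not taken from this diff) parses a per-target KERNEL file and exposes its Makefile assignments as CMake variables, with ifdef/ifndef/ifeq/ifneq branches resolved as implemented above:

    # Sketch only: KERNELDIR and the KERNEL file name are placeholders.
    set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/x86_64")
    ParseMakefileVars("${KERNELDIR}/KERNEL.SKYLAKEX")
    # A line such as "SGEMMKERNEL = sgemm_kernel.c" in the parsed file is now
    # available as ${SGEMMKERNEL}; lines inside branches whose condition was
    # false (or inside a skipped else scope) were ignored.
    message(STATUS "SGEMMKERNEL resolved to ${SGEMMKERNEL}")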
| @@ -1,13 +1,14 @@ | |||
| include ../Makefile.rule | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| all :: dgemv_tester dgemm_tester | |||
| dgemv_tester : | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester | |||
| ./dgemv_tester | |||
| dgemm_tester : dgemv_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester | |||
| ./dgemm_tester | |||
| clean :: | |||
| @@ -120,6 +120,7 @@ | |||
| #define CORE_SKYLAKEX 28 | |||
| #define CORE_DHYANA 29 | |||
| #define CORE_COOPERLAKE 30 | |||
| #define CORE_SAPPHIRERAPIDS 31 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -145,6 +146,7 @@ | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define HAVE_AVX512BF16 (1 << 23) | |||
| #define HAVE_AMXBF16 (1 << 24) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -222,6 +224,7 @@ typedef struct { | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #define CPUTYPE_DHYANA 53 | |||
| #define CPUTYPE_COOPERLAKE 54 | |||
| #define CPUTYPE_SAPPHIRERAPIDS 55 | |||
| #define CPUTYPE_HYGON_UNKNOWN 99 | |||
| @@ -26,10 +26,12 @@ | |||
| *****************************************************************************/ | |||
| #include <string.h> | |||
| #ifdef OS_DARWIN | |||
| #ifdef __APPLE__ | |||
| #include <sys/sysctl.h> | |||
| int32_t value; | |||
| size_t length=sizeof(value); | |||
| int64_t value64; | |||
| size_t length64=sizeof(value64); | |||
| #endif | |||
| #define CPU_UNKNOWN 0 | |||
| @@ -53,6 +55,8 @@ size_t length=sizeof(value); | |||
| #define CPU_EMAG8180 10 | |||
| // Apple | |||
| #define CPU_VORTEX 13 | |||
| // Fujitsu | |||
| #define CPU_A64FX 15 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -69,7 +73,8 @@ static char *cpuname[] = { | |||
| "NEOVERSEN1", | |||
| "THUNDERX3T110", | |||
| "VORTEX", | |||
| "CORTEXA55" | |||
| "CORTEXA55", | |||
| "A64FX" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -87,7 +92,8 @@ static char *cpuname_lower[] = { | |||
| "neoversen1", | |||
| "thunderx3t110", | |||
| "vortex", | |||
| "cortexa55" | |||
| "cortexa55", | |||
| "a64fx" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -183,6 +189,9 @@ int detect(void) | |||
| // Ampere | |||
| else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) | |||
| return CPU_EMAG8180; | |||
| // Fujitsu | |||
| else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) | |||
| return CPU_A64FX; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -212,9 +221,9 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
| if (value ==131287967) return CPU_VORTEX; | |||
| if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -265,7 +274,7 @@ int n=0; | |||
| printf("#define NUM_CORES %d\n",n); | |||
| #endif | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| #endif | |||
| @@ -285,154 +294,166 @@ void get_cpuconfig(void) | |||
| switch (d) | |||
| { | |||
| case CPU_CORTEXA53: | |||
| case CPU_CORTEXA55: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| case CPU_CORTEXA53: | |||
| case CPU_CORTEXA55: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_CORTEXA57: | |||
| case CPU_CORTEXA72: | |||
| case CPU_CORTEXA73: | |||
| case CPU_CORTEXA57: | |||
| case CPU_CORTEXA72: | |||
| case CPU_CORTEXA73: | |||
| // Common minimum settings for these Arm cores | |||
| // Can change a lot, but we need to be conservative | |||
| // TODO: detect info from /sys if possible | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 16777216\n"); | |||
| printf("#define L2_LINESIZE 128\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define THUNDERX2T99 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 16777216\n"); | |||
| printf("#define L2_LINESIZE 128\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define THUNDERX2T99 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_TSV110: | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| printf("#define L2_SIZE 524228 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_EMAG8180: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define EMAG8180\n"); | |||
| printf("#define L1_CODE_SIZE 32768\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| printf("#define THUNDERX3T110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 94371840 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef DARWIN | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_SIZE %d \n",value); | |||
| sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %d \n",value); | |||
| sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
| printf("#define L1_DATA_SIZE %d \n",value); | |||
| sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
| printf("#define L2_SIZE %d \n",value); | |||
| break; | |||
| case CPU_TSV110: | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| printf("#define L2_SIZE 524228 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_EMAG8180: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define EMAG8180\n"); | |||
| printf("#define L1_CODE_SIZE 32768\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| printf("#define THUNDERX3T110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 94371840 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef __APPLE__ | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #endif | |||
| case CPU_A64FX: | |||
| printf("#define A64FX\n"); | |||
| printf("#define L1_CODE_SIZE 65535\n"); | |||
| printf("#define L1_DATA_SIZE 65535\n"); | |||
| printf("#define L1_DATA_LINESIZE 256\n"); | |||
| printf("#define L2_SIZE 8388608\n"); | |||
| printf("#define L2_LINESIZE 256\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -165,6 +165,7 @@ void get_cpuconfig(void){ | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| @@ -178,3 +179,38 @@ void get_libname(void){ | |||
| printf("mips\n"); | |||
| } | |||
| } | |||
| int get_feature(char *search) | |||
| { | |||
| #ifdef __linux | |||
| FILE *infile; | |||
| char buffer[2048], *p,*t; | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("Features", buffer, 8)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if( p == NULL ) return 0; | |||
| t = strtok(p," "); | |||
| while( (t = strtok(NULL," ")) != NULL) | |||
| { | |||
| if (!strcmp(t, search)) { return(1); } | |||
| } | |||
| #endif | |||
| return(0); | |||
| } | |||
| @@ -104,17 +104,17 @@ int detect(void){ | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
| return CPU_LOONGSON3R3; | |||
| }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
| return CPU_LOONGSON3R4; | |||
| } else{ | |||
| return CPU_SICORTEX; | |||
| if (p != NULL){ | |||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
| return CPU_LOONGSON3R3; | |||
| } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
| return CPU_LOONGSON3R4; | |||
| } else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| } | |||
| char *get_corename(void){ | |||
| @@ -201,6 +201,7 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| } | |||
| if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| @@ -218,3 +219,38 @@ void get_libname(void){ | |||
| printf("mips64\n"); | |||
| } | |||
| } | |||
| int get_feature(char *search) | |||
| { | |||
| #ifdef __linux | |||
| FILE *infile; | |||
| char buffer[2048], *p,*t; | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("Features", buffer, 8)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if( p == NULL ) return 0; | |||
| t = strtok(p," "); | |||
| while( (t = strtok(NULL," ")) != NULL) | |||
| { | |||
| if (!strcmp(t, search)) { return(1); } | |||
| } | |||
| #endif | |||
| return(0); | |||
| } | |||
| @@ -1,3 +1,4 @@ | |||
| //{ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| @@ -266,6 +267,31 @@ int support_avx512_bf16(){ | |||
| #endif | |||
| } | |||
| #define BIT_AMX_TILE 0x01000000 | |||
| #define BIT_AMX_BF16 0x00400000 | |||
| #define BIT_AMX_ENBD 0x00060000 | |||
| int support_amx_bf16() { | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| if (!support_avx512()) | |||
| return 0; | |||
| // CPUID.7.0:EDX indicates AMX support | |||
| cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { | |||
| // CPUID.D.0:EAX[17:18] indicates AMX enabled | |||
| cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) | |||
| ret = 1; | |||
| } | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| char vendor[13]; | |||
| @@ -353,6 +379,7 @@ int get_cputype(int gettype){ | |||
| if (support_avx2()) feature |= HAVE_AVX2; | |||
| if (support_avx512()) feature |= HAVE_AVX512VL; | |||
| if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; | |||
| if (support_amx_bf16()) feature |= HAVE_AMXBF16; | |||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
| #endif | |||
| @@ -1429,10 +1456,10 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| case 12: // Tiger Lake | |||
| case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| @@ -1448,30 +1475,70 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| case 10: //family 6 exmodel 10 | |||
| case 15: // Sapphire Rapids | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| case 7: // Alder Lake desktop | |||
| case 10: // Alder Lake mobile | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: // Rocket Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: // Ice Lake NNPI | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: //family 6 exmodel 10 | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: // Rocket Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| break; | |||
| case 0x7: | |||
| return CPUTYPE_ITANIUM; | |||
| case 0xf: | |||
| @@ -2042,32 +2109,7 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| #endif | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 5: | |||
| switch (model) { | |||
| case 6: | |||
| @@ -2121,6 +2163,7 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| if (model == 6) | |||
| #ifndef NO_AVX512 | |||
| @@ -2135,7 +2178,7 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| if (model == 10) | |||
| if (model == 10 || model == 12) | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| @@ -2151,10 +2194,11 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| case 7: | |||
| if (model == 10) | |||
| return CORE_NEHALEM; | |||
| if (model == 14) | |||
| if (model == 13 || model == 14) // Ice Lake | |||
| #ifndef NO_AVX512 | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| @@ -2168,9 +2212,9 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| if (model == 12) { // Tiger Lake | |||
| if (model == 12 || model == 13) { // Tiger Lake | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| @@ -2180,7 +2224,7 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake | |||
| if (model == 14) { // Kaby Lake mobile | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| @@ -2190,12 +2234,82 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 15) { // Sapphire Rapids | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| if (model == 7 || model == 10) { // Alder Lake | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 13) { // Ice Lake NNPI | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake desktop | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| #endif | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| else return CORE_PRESCOTT; | |||
| } | |||
| } | |||
| } | |||
| @@ -2389,6 +2503,7 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
| if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); | |||
| if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| @@ -2460,9 +2575,11 @@ void get_sse(void){ | |||
| if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
| if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); | |||
| if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||
| } | |||
| //} | |||
| @@ -27,57 +27,11 @@ | |||
| #include <string.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_Z15 3 | |||
| #include "cpuid_zarch.h" | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13", | |||
| "Z14", | |||
| "Z15" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13", | |||
| "z14", | |||
| "z15" | |||
| }; | |||
| int detect(void) | |||
| { | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = fopen("/proc/sysinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("Type", buffer, 4)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| if (strstr(p, "3906")) return CPU_Z14; | |||
| if (strstr(p, "3907")) return CPU_Z14; | |||
| if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 | |||
| if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 | |||
| return CPU_GENERIC; | |||
| } | |||
| void get_libname(void) | |||
| { | |||
| int d = detect(); | |||
| printf("%s", cpuname_lower[d]); | |||
| } | |||
| @@ -0,0 +1,101 @@ | |||
| #include <stdlib.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_Z15 3 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13", | |||
| "Z14", | |||
| "Z15" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13", | |||
| "z14", | |||
| "z15" | |||
| }; | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| #if __GLIBC_PREREQ(2, 16) | |||
| #include <sys/auxv.h> | |||
| #define HAVE_GETAUXVAL 1 | |||
| static unsigned long get_hwcap(void) | |||
| { | |||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||
| char *maskenv; | |||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
| maskenv = getenv("LD_HWCAP_MASK"); | |||
| if (maskenv) | |||
| hwcap &= strtoul(maskenv, NULL, 0); | |||
| return hwcap; | |||
| // note that a missing auxval is interpreted as no capabilities | |||
| // available, which is safe. | |||
| } | |||
| #else // __GLIBC_PREREQ(2, 16) | |||
| #warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
| static unsigned long get_hwcap(void) { | |||
| // treat missing support for getauxval() as no capabilities available, | |||
| // which is safe. | |||
| return 0; | |||
| } | |||
| #endif // __GLIBC_PREREQ(2, 16) | |||
| #endif // __GLIBC | |||
| static int detect(void) | |||
| { | |||
| unsigned long hwcap = get_hwcap(); | |||
| // Choose the architecture level for optimized kernels based on hardware | |||
| // capability bits (just like glibc chooses optimized implementations). | |||
| // | |||
| // The hardware capability bits that are used here indicate both | |||
| // hardware support for a particular ISA extension and the presence of | |||
| // software support to enable its use. For example, when HWCAP_S390_VX | |||
| // is set then both the CPU can execute SIMD instructions and the Linux | |||
| // kernel can manage applications using the vector registers and SIMD | |||
| // instructions. | |||
| // | |||
| // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in | |||
| // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware | |||
| // capability bits. They are derived from the information that the | |||
| // "store facility list (extended)" instructions provide. | |||
| // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) | |||
| // | |||
| // currently used: | |||
| // HWCAP_S390_VX - vector facility for z/Architecture (introduced with | |||
| // IBM z13), enables level CPU_Z13 (SIMD) | |||
| // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM | |||
| // z14), together with VX enables level CPU_Z14 | |||
| // (single-precision SIMD instructions) | |||
| // | |||
| // When you add optimized kernels that make use of other ISA extensions | |||
| // (e.g., for exploiting the vector-enhancements facility 2 that was introduced | |||
| // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate | |||
| // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 | |||
| // for the z15 vector enhancements). | |||
| // | |||
| // To learn the value of hwcaps on a given system, set the environment | |||
| // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running | |||
| // LD_SHOW_AUXV=1 /bin/true). | |||
| // Also, the init function for dynamic arch support will print hwcaps | |||
| // when OPENBLAS_VERBOSE is set to 2 or higher. | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| return CPU_Z14; | |||
| if (hwcap & HWCAP_S390_VX) | |||
| return CPU_Z13; | |||
| return CPU_GENERIC; | |||
| } | |||
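The comment block above lays out how further architecture levels are meant to be gated on additional hwcap bits. As a minimal sketch (not part of the patch), this is roughly how a hypothetical CPU_Z15 gate could slot into detect(), building on the get_hwcap() helper and the CPU_* levels defined above. HWCAP_S390_VXRS_EXT2 (vector-enhancements facility 2) is assumed to be provided by the kernel/glibc hwcap headers; the fallback value matches recent Linux kernel definitions but is an assumption here.

#ifndef HWCAP_S390_VXRS_EXT2
#define HWCAP_S390_VXRS_EXT2 32768  /* assumed value, as defined by recent kernels */
#endif

/* Illustrative only: prefer the newest level whose facilities are all
 * advertised in AT_HWCAP, falling back one step at a time. */
static int detect_sketch(void)
{
        unsigned long hwcap = get_hwcap();

        if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE) &&
            (hwcap & HWCAP_S390_VXRS_EXT2))
                return CPU_Z15;   /* z15: vector enhancements 2 */
        if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
                return CPU_Z14;   /* z14: vector enhancements 1 */
        if (hwcap & HWCAP_S390_VX)
                return CPU_Z13;   /* z13: base vector facility  */
        return CPU_GENERIC;
}

A real CPU_Z15 level would also need matching kernels and a DYN_Z15 branch in dynamic_zarch.c, mirroring the existing DYN_Z13/DYN_Z14 handling shown further down.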
| @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #else | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Split local region of B into parts */ | |||
| for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ | |||
| min_jj = MIN(n_to, js + div_n) - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < ls - js; jjs += min_jj){ | |||
| min_jj = ls - js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | |||
| min_jj = js - ls - min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| list(APPEND COMMON_SOURCES dynamic_arm64.c) | |||
| elseif (POWER) | |||
| list(APPEND COMMON_SOURCES dynamic_power.c) | |||
| else () | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| endif () | |||
| @@ -40,7 +40,7 @@ | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #if defined(OS_CYGWIN_NT) && !defined(unlikely) | |||
| #if !defined(unlikely) | |||
| #ifdef __GNUC__ | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #else | |||
| @@ -391,8 +391,9 @@ int blas_thread_init(void){ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| #if defined(SMP_SERVER) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| // on Cygwin or as delayed init when a static library is used | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| @@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 10) { | |||
| if (model == 10 || model == 12){ | |||
| // Ice Lake SP | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| @@ -639,12 +639,12 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| } | |||
| return NULL; | |||
| case 7: | |||
| if (model == 10) // Goldmont Plus | |||
| return &gotoblas_NEHALEM; | |||
| if (model == 14) { | |||
| if (model == 13 || model == 14) { | |||
| // Ice Lake | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| @@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 12) { // Tiger Lake | |||
| if (model == 12 || model == 13) { // Tiger Lake | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| @@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| if (model == 15){ // Sapphire Rapids | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| if (model == 7 || model == 10) { // Alder Lake | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| @@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) { | |||
| #ifdef ARCH_X86 | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | |||
| #else | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | |||
| if (gotoblas == NULL) { | |||
| if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; | |||
| else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; | |||
| else if (support_avx2()) gotoblas = &gotoblas_HASWELL; | |||
| else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; | |||
| else gotoblas = &gotoblas_PRESCOTT; | |||
| } | |||
| /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ | |||
| if (sizeof(void*) == 8) { | |||
| if (gotoblas == &gotoblas_KATMAI || | |||
| @@ -1,38 +1,7 @@ | |||
| #include "common.h" | |||
| #include "cpuid_zarch.h" | |||
| #include <stdbool.h> | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| #if __GLIBC_PREREQ(2, 16) | |||
| #include <sys/auxv.h> | |||
| #define HAVE_GETAUXVAL 1 | |||
| static unsigned long get_hwcap(void) | |||
| { | |||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||
| char *maskenv; | |||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
| maskenv = getenv("LD_HWCAP_MASK"); | |||
| if (maskenv) | |||
| hwcap &= strtoul(maskenv, NULL, 0); | |||
| return hwcap; | |||
| // note that a missing auxval is interpreted as no capabilities | |||
| // available, which is safe. | |||
| } | |||
| #else // __GLIBC_PREREQ(2, 16) | |||
| #warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
| static unsigned long get_hwcap(void) { | |||
| // treat missing support for getauxval() as no capabilities available, | |||
| // which is safe. | |||
| return 0; | |||
| } | |||
| #endif // __GLIBC_PREREQ(2, 16) | |||
| #endif // __GLIBC | |||
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
| #ifdef DYN_Z13 | |||
| @@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; | |||
| #define NUM_CORETYPES 4 | |||
| extern int openblas_verbose(); | |||
| extern void openblas_warning(int verbose, const char* msg); | |||
| static char* corename[] = { | |||
| "unknown", | |||
| "Z13", | |||
| "Z14", | |||
| "ZARCH_GENERIC", | |||
| }; | |||
| char* gotoblas_corename(void) { | |||
| #ifdef DYN_Z13 | |||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
| if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; | |||
| #endif | |||
| #ifdef DYN_Z14 | |||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
| if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; | |||
| return corename[0]; | |||
| return "unknown"; | |||
| } | |||
| #ifndef HWCAP_S390_VXE | |||
| @@ -79,25 +42,28 @@ char* gotoblas_corename(void) { | |||
| */ | |||
| static gotoblas_t* get_coretype(void) { | |||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
| int cpu = detect(); | |||
| #ifdef DYN_Z14 | |||
| switch(cpu) { | |||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| case CPU_Z14: | |||
| #ifdef DYN_Z14 | |||
| return &gotoblas_Z14; | |||
| #endif | |||
| #ifdef DYN_Z13 | |||
| // z13: Vector Facility (SIMD for double) | |||
| if (hwcap & HWCAP_S390_VX) | |||
| case CPU_Z13: | |||
| #ifdef DYN_Z13 | |||
| return &gotoblas_Z13; | |||
| #endif | |||
| default: | |||
| // fallback in case of missing compiler support, systems before z13, or | |||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||
| // missing support in the OS kernel) | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| @@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
| for (i = 0; i < NUM_CORETYPES; i++) | |||
| { | |||
| if (!strncasecmp(coretype, corename[i], 20)) | |||
| if (!strncasecmp(coretype, cpuname[i], 20)) | |||
| { | |||
| found = i; | |||
| break; | |||
| } | |||
| } | |||
| if (found == 1) { | |||
| if (found == CPU_Z13) { | |||
| #ifdef DYN_Z13 | |||
| return &gotoblas_Z13; | |||
| #else | |||
| openblas_warning(1, "Z13 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| } else if (found == 2) { | |||
| } else if (found == CPU_Z14) { | |||
| #ifdef DYN_Z14 | |||
| return &gotoblas_Z14; | |||
| #else | |||
| openblas_warning(1, "Z14 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| } else if (found == 3) { | |||
| } else if (found == CPU_GENERIC) { | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| @@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| if (openblas_verbose() >= 2) { | |||
| snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", | |||
| getauxval(AT_HWCAP)); | |||
| openblas_warning(2, coremsg); | |||
| } | |||
| } | |||
| if (gotoblas == NULL) | |||
| @@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| if (openblas_verbose() >= 2) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| } | |||
| gotoblas->init(); | |||
| } | |||
| else { | |||
| @@ -246,6 +246,14 @@ int get_num_procs(void) { | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if defined(USE_OPENMP) | |||
| #if _OPENMP >= 201511 | |||
| nums = omp_get_num_places(); | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| #endif | |||
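The hunk above (and the similar one that follows) caps the detected processor count by the number of OpenMP places when OpenMP 4.5 or newer (_OPENMP >= 201511) is in use, so that OMP_PLACES and affinity restrictions are respected. A minimal standalone sketch of that logic follows; the zero-check is an extra safeguard of the sketch, not part of the patch, since omp_get_num_places() can report 0 when no place list is defined.

#include <unistd.h>
#ifdef _OPENMP
#include <omp.h>
#endif

/* Illustrative only: usable processors, bounded by OpenMP places if any. */
static int num_procs_sketch(void)
{
        int nums = (int) sysconf(_SC_NPROCESSORS_CONF);
#if defined(_OPENMP) && _OPENMP >= 201511
        int places = omp_get_num_places();  /* reflects OMP_PLACES, if set */
        if (places > 0)                     /* guard added for the sketch  */
                nums = places;
#endif
        return nums;
}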
| @@ -1806,10 +1814,19 @@ int get_num_procs(void) { | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if defined(USE_OPENMP) | |||
| /* if (omp_get_proc_bind() != omp_proc_bind_false) */ | |||
| #if _OPENMP >= 201511 | |||
| nums = omp_get_num_places(); | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| #else | |||
| @@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){ | |||
| position ++; | |||
| } while (position < NUM_BUFFERS); | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| if (memory_overflowed) { | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| do { | |||
| RMB; | |||
| do { | |||
| RMB; | |||
| #if defined(USE_OPENMP) | |||
| if (!newmemory[position-NUM_BUFFERS].used) { | |||
| blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||
| if (!newmemory[position-NUM_BUFFERS].used) { | |||
| blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||
| #endif | |||
| if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; | |||
| if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; | |||
| #if defined(USE_OPENMP) | |||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
| } | |||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
| } | |||
| #endif | |||
| position ++; | |||
| position ++; | |||
| } while (position < 512+NUM_BUFFERS); | |||
| } while (position < 512+NUM_BUFFERS); | |||
| } | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| } | |||
| goto error; | |||
| allocation : | |||
| @@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){ | |||
| func = &memoryalloc[0]; | |||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| @@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){ | |||
| return (void *)memory[position].addr; | |||
| error: | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| if (memory_overflowed) goto terminate; | |||
| fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); | |||
| memory_overflowed=1; | |||
| @@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){ | |||
| newmemory[i].used = 0; | |||
| newmemory[i].lock = 0; | |||
| } | |||
| newmemory[position-NUM_BUFFERS].used = 1; | |||
| allocation2: | |||
| newmemory[position-NUM_BUFFERS].used = 1; | |||
| @@ -3015,7 +3030,7 @@ allocation2: | |||
| func = &memoryalloc[0]; | |||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| @@ -3069,6 +3084,9 @@ allocation2: | |||
| return (void *)newmemory[position-NUM_BUFFERS].addr; | |||
| terminate: | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||
| printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||
| printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||
| @@ -183,7 +183,7 @@ int get_L2_size(void){ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ | |||
| defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -269,7 +269,7 @@ void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ | |||
| defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ | |||
| defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_SAPPHIRERAPIDS | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX512 | |||
| #ifdef NO_AVX2 | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "SAPPHIRERAPIDS" | |||
| #define ARCHCONFIG "-DSAPPHIRERAPIDS " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" | |||
| #define LIBNAME "sapphirerapids" | |||
| #define CORENAME "SAPPHIRERAPIDS" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DP5600 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
| #define LIBNAME "p5600" | |||
| #define CORENAME "P5600" | |||
| #else | |||
| @@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DMIPS1004K " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
| #define LIBNAME "mips1004K" | |||
| #define CORENAME "MIPS1004K" | |||
| #else | |||
| @@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DMIPS24K " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
| #define LIBNAME "mips24K" | |||
| #define CORENAME "MIPS24K" | |||
| #else | |||
| @@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ARMV8SVE | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV8SVE" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DARMV8SVE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
| #define LIBNAME "armv8sve" | |||
| #define CORENAME "ARMV8SVE" | |||
| #endif | |||
| #ifdef FORCE_ARMV8 | |||
| #define FORCE | |||
| @@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "VORTEX" | |||
| #endif | |||
| #ifdef FORCE_A64FX | |||
| #define ARMV8 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "A64FX" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DA64FX " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
| "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
| #define LIBNAME "a64fx" | |||
| #define CORENAME "A64FX" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| if (n == 0) return; | |||
| if (incx == 1 && trans == 0 && n < 50) { | |||
| buffer = NULL; | |||
| (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -42,14 +42,20 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ | |||
| BLASLONG n = *N; | |||
| BLASLONG incx = *INCX; | |||
| BLASLONG incy = *INCY; | |||
| FLOAT c = *C; | |||
| FLOAT s = *S; | |||
| #else | |||
| void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { | |||
| FLOAT *x = (FLOAT*) VX; | |||
| FLOAT *y = (FLOAT*) VY; | |||
| #endif /* CBLAS */ | |||
| PRINT_DEBUG_NAME; | |||
| if (n <= 0) return; | |||
| @@ -4,8 +4,16 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
| #else | |||
| void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
| FLOAT *DA = (FLOAT*) VDA; | |||
| FLOAT *DB = (FLOAT*) VDB; | |||
| FLOAT *S = (FLOAT*) VS; | |||
| #endif /* CBLAS */ | |||
| #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) | |||
| long double da_r = *(DA + 0); | |||
| @@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| if (n == 0) return; | |||
| if (incx == 1 && trans == 0 && n < 50) { | |||
| buffer = NULL; | |||
| (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| if (${DYNAMIC_ARCH}) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| endif () | |||
| ParseMakefileVars("${KERNELDIR}/KERNEL") | |||
| ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") | |||
| SetDefaultL1() | |||
| SetDefaultL2() | |||
| SetDefaultL3() | |||
| ParseMakefileVars("${KERNELDIR}/KERNEL") | |||
| ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") | |||
| set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h) | |||
| if(NOT NO_LAPACK) | |||
| @@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) | |||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
| @@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) | |||
| # symm for s and d | |||
| if (NOT DEFINED ${float_char}SYMMUCOPY_M) | |||
| set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| else () | |||
| set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") | |||
| set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") | |||
| endif() | |||
| GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. | |||
| # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| if (NOT DEFINED ${float_char}TRMMUNCOPY_M) | |||
| set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| else () | |||
| set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") | |||
| set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") | |||
| set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") | |||
| set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") | |||
| endif () | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | |||
| @@ -578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) | |||
| @@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| @@ -31,7 +31,22 @@ ifdef NO_AVX2 | |||
| endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| override CFLAGS += -march=sapphirerapids | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| else ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| override CFLAGS += -march=cooperlake | |||
| @@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), SAPPHIRERAPIDS) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), ZEN) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT | |||
| $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ | |||
| ifdef STRMMUNCOPY_M | |||
| $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMLNCOPY_M | |||
| $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMUTCOPY_M | |||
| $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMLTCOPY_M | |||
| $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N | |||
| $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef DTRMMUNCOPY_M | |||
| $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMLNCOPY_M | |||
| $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMUTCOPY_M | |||
| $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMLTCOPY_M | |||
| $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). | |||
| $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| ifdef SSYMMUCOPY_M | |||
| $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef SSYMMLCOPY_M | |||
| $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). | |||
| $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| ifdef DSYMMUCOPY_M | |||
| $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef DSYMMLCOPY_M | |||
| $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -0,0 +1,183 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,183 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| @@ -169,7 +169,7 @@ endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| @@ -169,7 +169,7 @@ endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| @@ -1 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||
| include $(KERNELDIR)/KERNEL.NEOVERSEN1 | |||
| @@ -0,0 +1,898 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
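| /** FMLA_xy forms the a_x * b_y product term (x, y in {R, I} for real/imaginary part) */ | |||
| /** and expands to fmla or fmls so each term is accumulated with the sign required */ | |||
| /** by the conjugation variant selected at compile time; FMLA_RR is always fmla */ | |||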
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define FMLA_RI "fmla " | |||
| #define FMLA_IR "fmla " | |||
| #define FMLA_II "fmls " | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define FMLA_RI "fmls " | |||
| #define FMLA_IR "fmla " | |||
| #define FMLA_II "fmla " | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define FMLA_RI "fmla " | |||
| #define FMLA_IR "fmls " | |||
| #define FMLA_II "fmla " | |||
| #else | |||
| #define FMLA_RI "fmls " | |||
| #define FMLA_IR "fmls " | |||
| #define FMLA_II "fmls " | |||
| #endif | |||
| #define FMLA_RR "fmla " | |||
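| /** contract two 4-element accumulators (separate real/imag vectors) into 8 complex */ | |||
| /** elements of one column of C, applying C += alpha * acc via de-interleaving vld2q/vst2q */ | |||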
| static inline void store_m8n1_contracted(float *C, | |||
| float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, | |||
| float alphar, float alphai) { | |||
| float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); | |||
| ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); | |||
| ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); | |||
| ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); | |||
| ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); | |||
| ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); | |||
| ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); | |||
| ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); | |||
| ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); | |||
| vst2q_f32(C, ld1); | |||
| vst2q_f32(C + 8, ld2); | |||
| } | |||
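| /** 8x4 microkernel: the inline-asm loop (unroll_k = 2) only accumulates partial */ | |||
| /** real/imag sums; alpha scaling and the write-back to C are done column by column */ | |||
| /** afterwards through store_m8n1_contracted */ | |||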
| static inline void kernel_8x4(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| const float *c_pref = C; | |||
| float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; | |||
| float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; | |||
| /** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ | |||
| /** v0-v1 and v10-v11 for B, v2-v9 for A */ | |||
| __asm__ __volatile__( | |||
| "cmp %[K],#0; mov %[c_pref],%[C]\n\t" | |||
| "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "movi %[c6i].16b,#0\n\t" | |||
| "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" | |||
| "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" | |||
| "beq 4f\n\t" | |||
| "cmp %[K],#2\n\t" | |||
| "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" | |||
| "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" | |||
| "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" | |||
| "bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t" | |||
| "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" | |||
| "blt 3f; beq 2f\n\t" | |||
| "1:\n\t" | |||
| "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
| FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
| FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" | |||
| FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
| "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
| FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
| FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" | |||
| FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
| "fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
| FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" | |||
| FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" | |||
| FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
| "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" | |||
| FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" | |||
| FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" | |||
| FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" | |||
| "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" | |||
| FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" | |||
| FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" | |||
| FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" | |||
| "fmov v7.d[1],x0; fmov d10,x5\n\t" | |||
| FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" | |||
| FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" | |||
| FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
| "fmov v10.d[1],x6; fmov d11,x2\n\t" | |||
| FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" | |||
| FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" | |||
| FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" | |||
| "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" | |||
| FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" | |||
| FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" | |||
| FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" | |||
| "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" | |||
| FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" | |||
| FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" | |||
| FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" | |||
| "fmov v9.d[1],x0; fmov d0,x5\n\t" | |||
| FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" | |||
| FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" | |||
| FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
| "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" | |||
| FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" | |||
| FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
| FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" | |||
| "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" | |||
| FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" | |||
| FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" | |||
| FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" | |||
| FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
| FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" | |||
| FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" | |||
| FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" | |||
| FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
| FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" | |||
| FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" | |||
| FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" | |||
| FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" | |||
| FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" | |||
| FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" | |||
| FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" | |||
| FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" | |||
| FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" | |||
| FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" | |||
| FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" | |||
| FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" | |||
| FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" | |||
| "bgt 1b; blt 3f\n\t" | |||
| "2:\n\t" | |||
| "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
| FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
| FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" | |||
| FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
| "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
| FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
| FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" | |||
| FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
| "fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
| FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" | |||
| FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" | |||
| FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
| "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" | |||
| FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" | |||
| FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" | |||
| FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" | |||
| "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" | |||
| FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" | |||
| FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" | |||
| FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" | |||
| "fmov v7.d[1],x0; fmov d10,x5\n\t" | |||
| FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" | |||
| FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" | |||
| FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
| "fmov v10.d[1],x6; fmov d11,x2\n\t" | |||
| FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" | |||
| FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" | |||
| FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" | |||
| "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" | |||
| FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" | |||
| FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" | |||
| FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" | |||
| "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" | |||
| FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" | |||
| FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" | |||
| FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" | |||
| "fmov v9.d[1],x0\n\t" | |||
| FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" | |||
| FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
| FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
| FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" | |||
| FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" | |||
| FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" | |||
| FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" | |||
| FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" | |||
| FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" | |||
| FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" | |||
| FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" | |||
| FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" | |||
| FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" | |||
| FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" | |||
| FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" | |||
| FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" | |||
| FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" | |||
| FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" | |||
| FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" | |||
| "b 4f\n\t" | |||
| "3:\n\t" | |||
| "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
| FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
| FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t" | |||
| FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
| "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
| FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
| FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" | |||
| FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
| "fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
| FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" | |||
| FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" | |||
| FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
| "fmov v1.d[1],x4\n\t" | |||
| FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" | |||
| FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" | |||
| FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" | |||
| FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" | |||
| FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" | |||
| FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" | |||
| FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" | |||
| FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" | |||
| FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" | |||
| FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" | |||
| FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
| FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
| "4:\n\t" | |||
| "mov %[c_pref],%[C]\n\t" | |||
| "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
| "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" | |||
| "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
| "zip2 %[c2i].2d,v4.2d,v5.2d\n\t" | |||
| "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" | |||
| "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" | |||
| "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" | |||
| "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" | |||
| "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" | |||
| "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" | |||
| "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" | |||
| "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" | |||
| "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" | |||
| "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" | |||
| :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), | |||
| [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), | |||
| [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), | |||
| [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), | |||
| [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) | |||
| :[C]"r"(C), [LDC]"r"(LDC) | |||
| :"cc","memory","x0","x1","x2","x3","x4","x5","x6", | |||
| "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); | |||
| store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; | |||
| store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; | |||
| store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2; | |||
| store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); | |||
| } | |||
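| /** the edge kernels below use NEON intrinsics and keep products in "expanded" form: */ | |||
| /** val[0..3] hold separate a*b_real and a*b_imag partial sums for two B elements, */ | |||
| /** so the complex contraction can be deferred to the store step */ | |||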
| static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, | |||
| float32x4_t a, float32x4_t b) { | |||
| acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); | |||
| acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); | |||
| acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); | |||
| acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); | |||
| return acc; | |||
| } | |||
| static inline float32x4x4_t expand_alpha(float alphar, float alphai) { | |||
| float32x4x4_t ret; | |||
| const float maskp[] = { -1, 1, -1, 1 }; | |||
| const float maskn[] = { 1, -1, 1, -1 }; | |||
| const float32x4_t vrevp = vld1q_f32(maskp); | |||
| const float32x4_t vrevn = vld1q_f32(maskn); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| ret.val[0] = vdupq_n_f32(alphar); | |||
| ret.val[1] = vdupq_n_f32(-alphai); | |||
| ret.val[2] = vmulq_f32(ret.val[1], vrevn); | |||
| ret.val[3] = vmulq_f32(ret.val[0], vrevp); | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| ret.val[0] = vdupq_n_f32(alphar); | |||
| ret.val[1] = vdupq_n_f32(alphai); | |||
| ret.val[2] = vmulq_f32(ret.val[1], vrevp); | |||
| ret.val[3] = vmulq_f32(ret.val[0], vrevn); | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| ret.val[2] = vdupq_n_f32(alphai); | |||
| ret.val[3] = vdupq_n_f32(alphar); | |||
| ret.val[0] = vmulq_f32(ret.val[3], vrevn); | |||
| ret.val[1] = vmulq_f32(ret.val[2], vrevp); | |||
| #else | |||
| ret.val[2] = vdupq_n_f32(alphai); | |||
| ret.val[3] = vdupq_n_f32(-alphar); | |||
| ret.val[0] = vmulq_f32(ret.val[3], vrevp); | |||
| ret.val[1] = vmulq_f32(ret.val[2], vrevn); | |||
| #endif | |||
| return ret; | |||
| } | |||
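| /** contract an expanded 2x2 accumulator into C: vrev64q swaps the real/imag lanes */ | |||
| /** so the cross terms combine with the four sign-adjusted alpha vectors from */ | |||
| /** expand_alpha using plain fmla, yielding C += alpha * acc for the chosen variant */ | |||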
| static inline void store_expanded_m2n2(float *C, BLASLONG LDC, | |||
| float32x4x4_t acc, float32x4x4_t expanded_alpha) { | |||
| float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); | |||
| ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
| ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); | |||
| acc.val[0] = vrev64q_f32(acc.val[0]); | |||
| acc.val[2] = vrev64q_f32(acc.val[2]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
| ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); | |||
| acc.val[1] = vrev64q_f32(acc.val[1]); | |||
| acc.val[3] = vrev64q_f32(acc.val[3]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
| ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
| ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); | |||
| vst1q_f32(C, ld1); | |||
| vst1q_f32(C + LDC * 2, ld2); | |||
| } | |||
| static inline float32x4x4_t init_expanded_m2n2() { | |||
| float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), | |||
| vdupq_n_f32(0), vdupq_n_f32(0) }}; | |||
| return ret; | |||
| } | |||
| static inline void kernel_4x4(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x4x4_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m2n2(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
| a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), | |||
| b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| c3 = acc_expanded_m2n2(c3, a1, b2); | |||
| c4 = acc_expanded_m2n2(c4, a2, b2); | |||
| c1 = acc_expanded_m2n2(c1, a3, b3); | |||
| c2 = acc_expanded_m2n2(c2, a4, b3); | |||
| c3 = acc_expanded_m2n2(c3, a3, b4); | |||
| c4 = acc_expanded_m2n2(c4, a4, b4); | |||
| } | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| c3 = acc_expanded_m2n2(c3, a1, b2); | |||
| c4 = acc_expanded_m2n2(c4, a2, b2); | |||
| } | |||
| float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
| store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
| C += LDC * 4; | |||
| store_expanded_m2n2(C, LDC, c3, e_alpha); | |||
| store_expanded_m2n2(C + 4, LDC, c4, e_alpha); | |||
| } | |||
| static inline void kernel_8x2(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x4x4_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m2n2(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
| float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); | |||
| float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| c3 = acc_expanded_m2n2(c3, a3, b1); | |||
| c4 = acc_expanded_m2n2(c4, a4, b1); | |||
| c1 = acc_expanded_m2n2(c1, a5, b2); | |||
| c2 = acc_expanded_m2n2(c2, a6, b2); | |||
| c3 = acc_expanded_m2n2(c3, a7, b2); | |||
| c4 = acc_expanded_m2n2(c4, a8, b2); | |||
| } | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
| float32x4_t b1 = vld1q_f32(sb); | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| c3 = acc_expanded_m2n2(c3, a3, b1); | |||
| c4 = acc_expanded_m2n2(c4, a4, b1); | |||
| } | |||
| float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
| store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
| store_expanded_m2n2(C + 8, LDC, c3, e_alpha); | |||
| store_expanded_m2n2(C + 12, LDC, c4, e_alpha); | |||
| } | |||
| static inline void kernel_4x2(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x4x4_t c1, c2; | |||
| c1 = c2 = init_expanded_m2n2(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| c1 = acc_expanded_m2n2(c1, a3, b2); | |||
| c2 = acc_expanded_m2n2(c2, a4, b2); | |||
| } | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x4_t b1 = vld1q_f32(sb); | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b1); | |||
| } | |||
| float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
| store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
| } | |||
| static inline void kernel_2x4(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x4x4_t c1, c2; | |||
| c1 = c2 = init_expanded_m2n2(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
| float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a1, b2); | |||
| c1 = acc_expanded_m2n2(c1, a2, b3); | |||
| c2 = acc_expanded_m2n2(c2, a2, b4); | |||
| } | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa); | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a1, b2); | |||
| } | |||
| float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
| store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); | |||
| } | |||
| static inline void kernel_2x2(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x4x4_t c1, c2; | |||
| c1 = c2 = init_expanded_m2n2(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; | |||
| float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| c2 = acc_expanded_m2n2(c2, a2, b2); | |||
| } | |||
| c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); | |||
| c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); | |||
| c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); | |||
| c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa); | |||
| float32x4_t b1 = vld1q_f32(sb); | |||
| c1 = acc_expanded_m2n2(c1, a1, b1); | |||
| } | |||
| store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); | |||
| } | |||
| static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, | |||
| float32x4_t a, float32x2_t b) { | |||
| acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); | |||
| acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); | |||
| return acc; | |||
| } | |||
| static inline void store_expanded_m2n1(float *C, | |||
| float32x4x2_t acc, float32x4x4_t expanded_alpha) { | |||
| float32x4_t ld1 = vld1q_f32(C); | |||
| ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
| acc.val[0] = vrev64q_f32(acc.val[0]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
| acc.val[1] = vrev64q_f32(acc.val[1]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
| ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
| vst1q_f32(C, ld1); | |||
| } | |||
| static inline float32x4x2_t init_expanded_m2n1() { | |||
| float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; | |||
| return ret; | |||
| } | |||
| static inline void kernel_8x1(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K) { | |||
| float32x4x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
| a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), | |||
| a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), | |||
| a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; | |||
| float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| c2 = acc_expanded_m2n1(c2, a2, b1); | |||
| c3 = acc_expanded_m2n1(c3, a3, b1); | |||
| c4 = acc_expanded_m2n1(c4, a4, b1); | |||
| c1 = acc_expanded_m2n1(c1, a5, b2); | |||
| c2 = acc_expanded_m2n1(c2, a6, b2); | |||
| c3 = acc_expanded_m2n1(c3, a7, b2); | |||
| c4 = acc_expanded_m2n1(c4, a8, b2); | |||
| } | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
| a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
| float32x2_t b1 = vld1_f32(sb); | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| c2 = acc_expanded_m2n1(c2, a2, b1); | |||
| c3 = acc_expanded_m2n1(c3, a3, b1); | |||
| c4 = acc_expanded_m2n1(c4, a4, b1); | |||
| } | |||
| float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n1(C, c1, expanded_alpha); | |||
| store_expanded_m2n1(C + 4, c2, expanded_alpha); | |||
| store_expanded_m2n1(C + 8, c3, expanded_alpha); | |||
| store_expanded_m2n1(C + 12, c4, expanded_alpha); | |||
| } | |||
| static inline void kernel_4x1(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K) { | |||
| float32x4x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
| a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
| float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| c2 = acc_expanded_m2n1(c2, a2, b1); | |||
| c3 = acc_expanded_m2n1(c3, a3, b2); | |||
| c4 = acc_expanded_m2n1(c4, a4, b2); | |||
| } | |||
| c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); | |||
| c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); | |||
| c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); | |||
| c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); | |||
| if (K) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
| float32x2_t b1 = vld1_f32(sb); | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| c2 = acc_expanded_m2n1(c2, a2, b1); | |||
| } | |||
| float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n1(C, c1, expanded_alpha); | |||
| store_expanded_m2n1(C + 4, c2, expanded_alpha); | |||
| } | |||
| static inline void kernel_2x1(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K) { | |||
| float32x4x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
| for (; K > 3; K -= 4) { | |||
| float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
| a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
| float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), | |||
| b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| c2 = acc_expanded_m2n1(c2, a2, b2); | |||
| c3 = acc_expanded_m2n1(c3, a3, b3); | |||
| c4 = acc_expanded_m2n1(c4, a4, b4); | |||
| } | |||
| c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); | |||
| c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); | |||
| c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); | |||
| c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); | |||
| c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); | |||
| c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); | |||
| for (; K; K--) { | |||
| float32x4_t a1 = vld1q_f32(sa); sa += 4; | |||
| float32x2_t b1 = vld1_f32(sb); sb += 2; | |||
| c1 = acc_expanded_m2n1(c1, a1, b1); | |||
| } | |||
| float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| store_expanded_m2n1(C, c1, expanded_alpha); | |||
| } | |||
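| /** 64-bit (2-lane) counterparts of the helpers above, used by the m=1 edge kernels */ | |||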
| static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { | |||
| float32x2x4_t ret; | |||
| const float maskp[] = { -1, 1 }; | |||
| const float maskn[] = { 1, -1 }; | |||
| const float32x2_t vrevp = vld1_f32(maskp); | |||
| const float32x2_t vrevn = vld1_f32(maskn); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| ret.val[0] = vdup_n_f32(alphar); | |||
| ret.val[1] = vdup_n_f32(-alphai); | |||
| ret.val[2] = vmul_f32(ret.val[1], vrevn); | |||
| ret.val[3] = vmul_f32(ret.val[0], vrevp); | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| ret.val[0] = vdup_n_f32(alphar); | |||
| ret.val[1] = vdup_n_f32(alphai); | |||
| ret.val[2] = vmul_f32(ret.val[1], vrevp); | |||
| ret.val[3] = vmul_f32(ret.val[0], vrevn); | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| ret.val[2] = vdup_n_f32(alphai); | |||
| ret.val[3] = vdup_n_f32(alphar); | |||
| ret.val[0] = vmul_f32(ret.val[3], vrevn); | |||
| ret.val[1] = vmul_f32(ret.val[2], vrevp); | |||
| #else | |||
| ret.val[2] = vdup_n_f32(alphai); | |||
| ret.val[3] = vdup_n_f32(-alphar); | |||
| ret.val[0] = vmul_f32(ret.val[3], vrevp); | |||
| ret.val[1] = vmul_f32(ret.val[2], vrevn); | |||
| #endif | |||
| return ret; | |||
| } | |||
| static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, | |||
| float32x2_t a, float32x2_t b) { | |||
| acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); | |||
| acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); | |||
| return acc; | |||
| } | |||
| static inline void store_expanded_m1n1(float *C, | |||
| float32x2x2_t acc, float32x2x4_t expanded_alpha) { | |||
| float32x2_t ld1 = vld1_f32(C); | |||
| ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
| acc.val[0] = vrev64_f32(acc.val[0]); | |||
| ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
| acc.val[1] = vrev64_f32(acc.val[1]); | |||
| ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
| ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
| vst1_f32(C, ld1); | |||
| } | |||
| static inline float32x2x2_t init_expanded_m1n1() { | |||
| float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; | |||
| return ret; | |||
| } | |||
| static inline void kernel_1x4(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
| for (; K; K--) { | |||
| float32x2_t a1 = vld1_f32(sa); sa += 2; | |||
| c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
| c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
| c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); | |||
| c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); | |||
| sb += 8; | |||
| } | |||
| float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); | |||
| store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; | |||
| store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; | |||
| store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; | |||
| store_expanded_m1n1(C, c4, expanded_alpha); | |||
| } | |||
| static inline void kernel_1x2(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
| float32x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
| for (; K > 1; K -= 2) { | |||
| float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; | |||
| c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
| c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
| c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); | |||
| c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); | |||
| sb += 8; | |||
| } | |||
| c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); | |||
| c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); | |||
| c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); | |||
| c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); | |||
| if (K) { | |||
| float32x2_t a1 = vld1_f32(sa); | |||
| c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
| c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
| } | |||
| float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); | |||
| store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; | |||
| store_expanded_m1n1(C, c2, expanded_alpha); | |||
| } | |||
| static inline void kernel_1x1(const float *sa, const float *sb, float *C, | |||
| float alphar, float alphai, BLASLONG K) { | |||
| float32x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
| for (; K > 3; K -= 4) { | |||
| c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); | |||
| c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); | |||
| c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); | |||
| c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); | |||
| sa += 8; sb += 8; | |||
| } | |||
| c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); | |||
| c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); | |||
| c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); | |||
| c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); | |||
| c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); | |||
| c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); | |||
| for (; K; K--) { | |||
| c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); | |||
| sa += 2; sb += 2; | |||
| } | |||
| store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); | |||
| } | |||
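| /** driver: walk the packed B panel in column blocks of 8/4/2/1 and, within each, */ | |||
| /** the packed A panel in row blocks of 8/4/2/1, dispatching the matching kernel; */ | |||
| /** the n=8 case reuses the n=4 kernels on two adjacent K*4 sub-panels of sb */ | |||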
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, | |||
| FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
| BLASLONG n_left = N; | |||
| for (; n_left >= 8; n_left -= 8) { | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c1_ = C; | |||
| FLOAT *c2_ = C + LDC * 8; | |||
| const FLOAT *b1_ = sb; | |||
| const FLOAT *b2_ = sb + K * 8; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 8; m_left -= 8) { | |||
| kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
| kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
| a_ += 16 * K; | |||
| c1_ += 16; | |||
| c2_ += 16; | |||
| } | |||
| if (m_left >= 4) { | |||
| m_left -= 4; | |||
| kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
| kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
| a_ += 8 * K; | |||
| c1_ += 8; | |||
| c2_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
| kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
| a_ += 4 * K; | |||
| c1_ += 4; | |||
| c2_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
| kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
| } | |||
| C += 16 * LDC; | |||
| sb += 16 * K; | |||
| } | |||
| if (n_left >= 4) { | |||
| n_left -= 4; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 8; m_left -= 8) { | |||
| kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 16 * K; | |||
| c_ += 16; | |||
| } | |||
| if (m_left >= 4) { | |||
| m_left -= 4; | |||
| kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 8 * K; | |||
| c_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 4 * K; | |||
| c_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
| } | |||
| C += 8 * LDC; | |||
| sb += 8 * K; | |||
| } | |||
| if (n_left >= 2) { | |||
| n_left -= 2; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 8; m_left -= 8) { | |||
| kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 16 * K; | |||
| c_ += 16; | |||
| } | |||
| if (m_left >= 4) { | |||
| m_left -= 4; | |||
| kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 8 * K; | |||
| c_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
| a_ += 4 * K; | |||
| c_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
| } | |||
| C += 4 * LDC; | |||
| sb += 4 * K; | |||
| } | |||
| if (n_left) { | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 8; m_left -= 8) { | |||
| kernel_8x1(sa, sb, C, alphar, alphai, K); | |||
| sa += 16 * K; | |||
| C += 16; | |||
| } | |||
| if (m_left >= 4) { | |||
| m_left -= 4; | |||
| kernel_4x1(sa, sb, C, alphar, alphai, K); | |||
| sa += 8 * K; | |||
| C += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x1(sa, sb, C, alphar, alphai, K); | |||
| sa += 4 * K; | |||
| C += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x1(sa, sb, C, alphar, alphai, K); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,890 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| /********************************************************** | |||
| * Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 | |||
| * Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] | |||
| * Matrix orders: | |||
| * sa: column-major (leading dimension == 4) | |||
| * sb: 3 concatenated row-major 4-column submatrices | |||
| * C: column-major (leading dimension == LDC) | |||
| *********************************************************/ | |||
| static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| /** prefetch the 4x12 block of C that will later be read and written back */ | |||
| __asm__ __volatile__( | |||
| "mov x0,%[C]\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
| "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" | |||
| ::[C]"r"(C), [LDC]"r"(LDC):"x0"); | |||
| /** 3 pointers to 3 submatrices of sb respectively */ | |||
| const FLOAT *b1_ = sb; | |||
| const FLOAT *b2_ = sb + K * 4; | |||
| const FLOAT *b3_ = sb + K * 8; | |||
| /** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ | |||
| /** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ | |||
| /** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ | |||
| /** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ | |||
| /** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ | |||
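| /** each 128-bit vector load is split into an "ldr d" plus an "ldr x"/"fmov" into the */ | |||
| /** high half, a pattern the in-order Cortex-A53 pipeline issues more smoothly than */ | |||
| /** full q-register loads */ | |||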
| __asm__ __volatile__( | |||
| "cmp %[K],#0\n\t" | |||
| /** fill registers holding elements of C with 0.0 */ | |||
| "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" | |||
| "movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" | |||
| "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" | |||
| "movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" | |||
| "movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" | |||
| "movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" | |||
| "beq 4f; cmp %[K],#2\n\t" | |||
| /** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */ | |||
| "ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" | |||
| "ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" | |||
| "blt 3f; beq 2f\n\t" | |||
| "1:\n\t" | |||
| /** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ | |||
| "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
| "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
| "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
| "fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
| "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" | |||
| "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" | |||
| "fmla v12.2d,v0.2d,v5.d[0]\n\t" | |||
| "fmla v13.2d,v1.2d,v5.d[0]\n\t" | |||
| "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" | |||
| "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
| "fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
| "fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
| "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
| "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
| "fmla v18.2d,v0.2d,v6.d[1]\n\t" | |||
| "fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
| "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" | |||
| "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
| "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" | |||
| "fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
| "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" | |||
| "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" | |||
| "fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" | |||
| "fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
| "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" | |||
| "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" | |||
| "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" | |||
| "fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
| "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" | |||
| "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" | |||
| "fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" | |||
| "fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
| "ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" | |||
| "fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" | |||
| "fmla v9.2d,v3.2d,v6.d[0]\n\t" | |||
| "fmla v10.2d,v2.2d,v6.d[1]\n\t" | |||
| "ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" | |||
| "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" | |||
| "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" | |||
| "fmla v13.2d,v3.2d,v7.d[0]\n\t" | |||
| "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" | |||
| "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" | |||
| "fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" | |||
| "fmla v16.2d,v2.2d,v4.d[0]\n\t" | |||
| "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" | |||
| "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" | |||
| "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" | |||
| "fmla v19.2d,v3.2d,v4.d[1]\n\t" | |||
| "ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" | |||
| "fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" | |||
| "fmla v21.2d,v3.2d,v5.d[0]\n\t" | |||
| "fmla v22.2d,v2.2d,v5.d[1]\n\t" | |||
| "ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" | |||
| "fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" | |||
| "fmla v24.2d,v2.2d,v6.d[0]\n\t" | |||
| "fmla v25.2d,v3.2d,v6.d[0]\n\t" | |||
| "ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" | |||
| "fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" | |||
| "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" | |||
| "fmla v28.2d,v2.2d,v7.d[0]\n\t" | |||
| "ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" | |||
| "fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" | |||
| "fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" | |||
| "fmla v31.2d,v3.2d,v7.d[1]\n\t" | |||
| "bgt 1b; blt 3f\n\t" | |||
| "2:\n\t" | |||
| /** tail part with k = 2 */ | |||
| "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
| "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
| "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
| "fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
| "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" | |||
| "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" | |||
| "fmla v12.2d,v0.2d,v5.d[0]\n\t" | |||
| "fmla v13.2d,v1.2d,v5.d[0]\n\t" | |||
| "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" | |||
| "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
| "fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
| "fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
| "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
| "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
| "fmla v18.2d,v0.2d,v6.d[1]\n\t" | |||
| "fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
| "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" | |||
| "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
| "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" | |||
| "fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
| "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" | |||
| "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" | |||
| "fmla v24.2d,v0.2d,v4.d[0]\n\t" | |||
| "fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
| "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" | |||
| "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" | |||
| "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" | |||
| "fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
| "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" | |||
| "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" | |||
| "fmla v30.2d,v0.2d,v5.d[1]\n\t" | |||
| "fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
| "fmov v4.d[1],x0\n\t" | |||
| "fmla v8.2d,v2.2d,v6.d[0]\n\t" | |||
| "fmla v9.2d,v3.2d,v6.d[0]\n\t" | |||
| "fmla v10.2d,v2.2d,v6.d[1]\n\t" | |||
| "ldr d5,[%[b2_],#48]\n\t" | |||
| "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" | |||
| "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" | |||
| "fmla v13.2d,v3.2d,v7.d[0]\n\t" | |||
| "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" | |||
| "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" | |||
| "fmla v15.2d,v3.2d,v7.d[1]\n\t" | |||
| "fmla v16.2d,v2.2d,v4.d[0]\n\t" | |||
| "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" | |||
| "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" | |||
| "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" | |||
| "fmla v19.2d,v3.2d,v4.d[1]\n\t" | |||
| "fmov v7.d[1],x0\n\t" | |||
| "fmla v20.2d,v2.2d,v5.d[0]\n\t" | |||
| "fmla v21.2d,v3.2d,v5.d[0]\n\t" | |||
| "fmla v22.2d,v2.2d,v5.d[1]\n\t" | |||
| "fmla v23.2d,v3.2d,v5.d[1]\n\t" | |||
| "fmla v24.2d,v2.2d,v6.d[0]\n\t" | |||
| "fmla v25.2d,v3.2d,v6.d[0]\n\t" | |||
| "fmla v26.2d,v2.2d,v6.d[1]\n\t" | |||
| "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" | |||
| "fmla v28.2d,v2.2d,v7.d[0]\n\t" | |||
| "fmla v29.2d,v3.2d,v7.d[0]\n\t" | |||
| "fmla v30.2d,v2.2d,v7.d[1]\n\t" | |||
| "fmla v31.2d,v3.2d,v7.d[1]\n\t" | |||
| "b 4f\n\t" | |||
| "3:\n\t" | |||
| /** tail part with k = 1 */ | |||
| "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
| "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
| "fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" | |||
| "fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
| "fmov v7.d[1],x0\n\t" | |||
| "fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" | |||
| "fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" | |||
| "fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" | |||
| "ldr d4,[%[b3_]]\n\t" | |||
| "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
| "fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
| "fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
| "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
| "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
| "fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" | |||
| "fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
| "fmov v5.d[1],x0\n\t" | |||
| "fmla v20.2d,v0.2d,v7.d[0]\n\t" | |||
| "fmla v21.2d,v1.2d,v7.d[0]\n\t" | |||
| "fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
| "fmla v23.2d,v1.2d,v7.d[1]\n\t" | |||
| "fmla v24.2d,v0.2d,v4.d[0]\n\t" | |||
| "fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
| "fmla v26.2d,v0.2d,v4.d[1]\n\t" | |||
| "fmla v27.2d,v1.2d,v4.d[1]\n\t" | |||
| "fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
| "fmla v29.2d,v1.2d,v5.d[0]\n\t" | |||
| "fmla v30.2d,v0.2d,v5.d[1]\n\t" | |||
| "fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
| /** store 4x12 elements to C */ | |||
| "4:\n\t" | |||
| "ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
| "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
| "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
| "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
| "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
| "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
| "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
| "fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" | |||
| "fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" | |||
| "stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" | |||
| :[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) | |||
| :[LDC]"r"(LDC), [alpha]"m"(alpha) | |||
| :"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | |||
| "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", | |||
| "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
| } | |||
| /********************************************************** | |||
| * Operation: | |||
| C[0] += alpha * up[0]; C[1] += alpha * up[1]; | |||
| C[2] += alpha * down[0]; C[3] += alpha * down[1]; | |||
| *********************************************************/ | |||
| static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { | |||
| float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); | |||
| t1 = vfmaq_n_f64(t1, up, alpha); | |||
| t2 = vfmaq_n_f64(t2, down, alpha); | |||
| vst1q_f64(C, t1); | |||
| vst1q_f64(C + 2, t2); | |||
| } | |||
| /********************************************************** | |||
| * Function: dgemm_kernel_arm64_4x4_m4n8 | |||
| * Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] | |||
| * Matrix orders: | |||
| * sa: column-major (leading dimension == 4) | |||
| * sb: 2 concatenated row-major 4-column submatrices | |||
| * C: column-major (leading dimension == LDC) | |||
| *********************************************************/ | |||
| static inline void dgemm_kernel_arm64_4x4_m4n8( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| const FLOAT *b1_ = sb; | |||
| const FLOAT *b2_ = sb + K * 4; | |||
| /** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ | |||
| float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; | |||
| float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; | |||
| c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); | |||
| c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); | |||
| for (; K; K--) { | |||
| float64x2_t a1 = vld1q_f64(sa); | |||
| float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(b1_); | |||
| c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
| c21 = vfmaq_laneq_f64(c21, a2, b1, 0); | |||
| c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
| c22 = vfmaq_laneq_f64(c22, a2, b1, 1); | |||
| float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
| c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
| c23 = vfmaq_laneq_f64(c23, a2, b2, 0); | |||
| c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
| c24 = vfmaq_laneq_f64(c24, a2, b2, 1); | |||
| float64x2_t b3 = vld1q_f64(b2_); | |||
| c15 = vfmaq_laneq_f64(c15, a1, b3, 0); | |||
| c25 = vfmaq_laneq_f64(c25, a2, b3, 0); | |||
| c16 = vfmaq_laneq_f64(c16, a1, b3, 1); | |||
| c26 = vfmaq_laneq_f64(c26, a2, b3, 1); | |||
| float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
| c17 = vfmaq_laneq_f64(c17, a1, b4, 0); | |||
| c27 = vfmaq_laneq_f64(c27, a2, b4, 0); | |||
| c18 = vfmaq_laneq_f64(c18, a1, b4, 1); | |||
| c28 = vfmaq_laneq_f64(c28, a2, b4, 1); | |||
| } | |||
| dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c18, c28, alpha); | |||
| } | |||
| /********************************************************** | |||
| * Function: dgemm_kernel_arm64_4x4_m4n4 | |||
| * Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] | |||
| * Matrix orders: | |||
| * sa: column-major (leading dimension == 4) | |||
| * sb: row-major (leading dimension == 4) | |||
| * C: column-major (leading dimension == LDC) | |||
| *********************************************************/ | |||
| static inline void dgemm_kernel_arm64_4x4_m4n4( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; | |||
| c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); | |||
| for (; K; K--) { | |||
| float64x2_t a1 = vld1q_f64(sa); | |||
| float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(sb); | |||
| float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; | |||
| c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
| c21 = vfmaq_laneq_f64(c21, a2, b1, 0); | |||
| c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
| c22 = vfmaq_laneq_f64(c22, a2, b1, 1); | |||
| c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
| c23 = vfmaq_laneq_f64(c23, a2, b2, 0); | |||
| c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
| c24 = vfmaq_laneq_f64(c24, a2, b2, 1); | |||
| } | |||
| dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c14, c24, alpha); | |||
| } | |||
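| /** dgemm_kernel_arm64_4x4_m4n2: C[4][2] += alpha * sa[4][K] * sb[K][2], with K unrolled by 2 into separate accumulators */ | |||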
| static inline void dgemm_kernel_arm64_4x4_m4n2( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; | |||
| c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), | |||
| a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; | |||
| c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); | |||
| c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); | |||
| c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); | |||
| c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); | |||
| c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); | |||
| c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); | |||
| c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); | |||
| c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); | |||
| } | |||
| c11_1 = vaddq_f64(c11_1, c11_2); | |||
| c21_1 = vaddq_f64(c21_1, c21_2); | |||
| c12_1 = vaddq_f64(c12_1, c12_2); | |||
| c22_1 = vaddq_f64(c22_1, c22_2); | |||
| if (K) { | |||
| float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); | |||
| c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); | |||
| c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); | |||
| c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); | |||
| } | |||
| dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; | |||
| dgemm_store_m4n1(C, c12_1, c22_1, alpha); | |||
| } | |||
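| /** dgemm_kernel_arm64_4x4_m4n1: C[4][1] += alpha * sa[4][K] * sb[K][1], with K unrolled by 2 */ | |||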
| static inline void dgemm_kernel_arm64_4x4_m4n1( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c11_1, c11_2, c21_1, c21_2; | |||
| c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
| c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); | |||
| c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); | |||
| c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); | |||
| c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); | |||
| sa += 8; | |||
| } | |||
| c11_1 = vaddq_f64(c11_1, c11_2); | |||
| c21_1 = vaddq_f64(c21_1, c21_2); | |||
| if (K) { | |||
| double b1 = *sb++; | |||
| c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); | |||
| c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); | |||
| sa += 4; | |||
| } | |||
| dgemm_store_m4n1(C, c11_1, c21_1, alpha); | |||
| } | |||
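| /** dgemm_kernel_arm64_4x4_m2n12: C[2][12] += alpha * sa[2][K] * sb[K][12]; sb is 3 concatenated K x 4 row-major panels */ | |||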
| static inline void dgemm_kernel_arm64_4x4_m2n12( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; | |||
| c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = | |||
| c21 = c22 = c23 = c24 = vdupq_n_f64(0); | |||
| const FLOAT *b1_ = sb; | |||
| const FLOAT *b2_ = sb + 4 * K; | |||
| const FLOAT *b3_ = b2_ + 4 * K; | |||
| for (; K; K--) { | |||
| const float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
| c01 = vfmaq_laneq_f64(c01, a1, b1, 0); | |||
| c02 = vfmaq_laneq_f64(c02, a1, b1, 1); | |||
| c03 = vfmaq_laneq_f64(c03, a1, b2, 0); | |||
| c04 = vfmaq_laneq_f64(c04, a1, b2, 1); | |||
| b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
| c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
| c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
| c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
| c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
| b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; | |||
| c21 = vfmaq_laneq_f64(c21, a1, b1, 0); | |||
| c22 = vfmaq_laneq_f64(c22, a1, b1, 1); | |||
| c23 = vfmaq_laneq_f64(c23, a1, b2, 0); | |||
| c24 = vfmaq_laneq_f64(c24, a1, b2, 1); | |||
| } | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); | |||
| } | |||
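| /** dgemm_kernel_arm64_4x4_m2n8: C[2][8] += alpha * sa[2][K] * sb[K][8]; sb is 2 concatenated K x 4 row-major panels */ | |||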
| static inline void dgemm_kernel_arm64_4x4_m2n8( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; | |||
| c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0); | |||
| const FLOAT *b1_ = sb; | |||
| const FLOAT *b2_ = sb + 4 * K; | |||
| for (; K; K--) { | |||
| const float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
| c01 = vfmaq_laneq_f64(c01, a1, b1, 0); | |||
| c02 = vfmaq_laneq_f64(c02, a1, b1, 1); | |||
| c03 = vfmaq_laneq_f64(c03, a1, b2, 0); | |||
| c04 = vfmaq_laneq_f64(c04, a1, b2, 1); | |||
| b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
| c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
| c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
| c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
| c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
| } | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); | |||
| } | |||
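| /** dgemm_kernel_arm64_4x4_m2n4: C[2][4] += alpha * sa[2][K] * sb[K][4], with K unrolled by 2 */ | |||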
| static inline void dgemm_kernel_arm64_4x4_m2n4( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; | |||
| c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); | |||
| float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; | |||
| c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); | |||
| c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); | |||
| c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); | |||
| c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); | |||
| c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); | |||
| c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); | |||
| c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); | |||
| c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); | |||
| } | |||
| c1_1 = vaddq_f64(c1_1, c1_2); | |||
| c2_1 = vaddq_f64(c2_1, c2_2); | |||
| c3_1 = vaddq_f64(c3_1, c3_2); | |||
| c4_1 = vaddq_f64(c4_1, c4_2); | |||
| if (K) { | |||
| float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
| c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
| c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); | |||
| c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); | |||
| } | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); | |||
| } | |||
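| /** dgemm_kernel_arm64_4x4_m2n2: C[2][2] += alpha * sa[2][K] * sb[K][2], with K unrolled by 2 */ | |||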
| static inline void dgemm_kernel_arm64_4x4_m2n2( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1_1, c1_2, c2_1, c2_2; | |||
| c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
| c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
| c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); | |||
| c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); | |||
| } | |||
| c1_1 = vaddq_f64(c1_1, c1_2); | |||
| c2_1 = vaddq_f64(c2_1, c2_2); | |||
| if (K) { | |||
| float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
| c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
| c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
| } | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); | |||
| } | |||
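| /** dgemm_kernel_arm64_4x4_m2n1: C[2][1] += alpha * sa[2][K] * sb[K][1], with K unrolled by 4 */ | |||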
| static inline void dgemm_kernel_arm64_4x4_m2n1( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
| for (; K > 3; K -= 4) { | |||
| float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; | |||
| c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); | |||
| c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); | |||
| c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); | |||
| c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); | |||
| sa += 8; | |||
| } | |||
| c1 = vaddq_f64(c1, c2); | |||
| c3 = vaddq_f64(c3, c4); | |||
| c1 = vaddq_f64(c1, c3); | |||
| for (; K; K--) { | |||
| c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); | |||
| sa += 2; | |||
| } | |||
| vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha)); | |||
| } | |||
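| /********************************************************** | |||
| * Operation: | |||
| C[0] += alpha * vc[0]; C[LDC] += alpha * vc[1]; | |||
| *********************************************************/ | |||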
| static inline void dgemm_store_m1n2(double *C, float64x2_t vc, | |||
| double alpha, BLASLONG LDC) { | |||
| double c0 = vgetq_lane_f64(vc, 0); | |||
| double c1 = vgetq_lane_f64(vc, 1); | |||
| C[0] += c0 * alpha; | |||
| C[LDC] += c1 * alpha; | |||
| } | |||
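| /** dgemm_kernel_arm64_4x4_m1n12: C[1][12] += alpha * sa[1][K] * sb[K][12]; sb is 3 concatenated K x 4 row-major panels */ | |||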
| static inline void dgemm_kernel_arm64_4x4_m1n12( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1, c2, c3, c4, c5, c6; | |||
| c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); | |||
| const double *b1_ = sb; | |||
| const double *b2_ = sb + 4 * K; | |||
| const double *b3_ = b2_ + 4 * K; | |||
| for (; K; K--) { | |||
| const double a1 = *sa++; | |||
| c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); | |||
| c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; | |||
| c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); | |||
| c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; | |||
| c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); | |||
| c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; | |||
| } | |||
| dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c6, alpha, LDC); | |||
| } | |||
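| /** dgemm_kernel_arm64_4x4_m1n8: C[1][8] += alpha * sa[1][K] * sb[K][8]; sb is 2 concatenated K x 4 row-major panels */ | |||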
| static inline void dgemm_kernel_arm64_4x4_m1n8( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
| const double *b1_ = sb; | |||
| const double *b2_ = sb + 4 * K; | |||
| for (; K; K--) { | |||
| const double a1 = *sa++; | |||
| c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); | |||
| c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; | |||
| c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); | |||
| c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; | |||
| } | |||
| dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c4, alpha, LDC); | |||
| } | |||
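| /** dgemm_kernel_arm64_4x4_m1n4: C[1][4] += alpha * sa[1][K] * sb[K][4], with K unrolled by 2 */ | |||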
| static inline void dgemm_kernel_arm64_4x4_m1n4( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1_1, c1_2, c2_1, c2_2; | |||
| c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); | |||
| c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); | |||
| c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); | |||
| c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; | |||
| } | |||
| c1_1 = vaddq_f64(c1_1, c1_2); | |||
| c2_1 = vaddq_f64(c2_1, c2_2); | |||
| if (K) { | |||
| double a1 = *sa++; | |||
| c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); | |||
| c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); | |||
| sb += 4; | |||
| } | |||
| dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2; | |||
| dgemm_store_m1n2(C, c2_1, alpha, LDC); | |||
| } | |||
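| /** dgemm_kernel_arm64_4x4_m1n2: C[1][2] += alpha * sa[1][K] * sb[K][2], with K unrolled by 4 */ | |||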
| static inline void dgemm_kernel_arm64_4x4_m1n2( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
| for (; K > 3; K -= 4) { | |||
| float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; | |||
| c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); | |||
| c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); | |||
| c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); | |||
| c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; | |||
| } | |||
| c1 = vaddq_f64(c1, c2); | |||
| c3 = vaddq_f64(c3, c4); | |||
| c1 = vaddq_f64(c1, c3); | |||
| for (; K; K--) { | |||
| c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); | |||
| sb += 2; | |||
| } | |||
| dgemm_store_m1n2(C, c1, alpha, LDC); | |||
| } | |||
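| /** dgemm_kernel_arm64_4x4_m1n1: C[1][1] += alpha * dot(sa[0..K-1], sb[0..K-1]), with K unrolled by 8 */ | |||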
| static inline void dgemm_kernel_arm64_4x4_m1n1( | |||
| const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
| BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
| float64x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
| for (; K > 7; K -= 8) { | |||
| c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); | |||
| c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); | |||
| c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); | |||
| c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); | |||
| sa += 8; sb += 8; | |||
| } | |||
| c1 = vaddq_f64(c1, c2); | |||
| c3 = vaddq_f64(c3, c4); | |||
| c1 = vaddq_f64(c1, c3); | |||
| double cs1 = vpaddd_f64(c1); | |||
| for (; K; K--) { | |||
| cs1 += (*sa++) * (*sb++); | |||
| } | |||
| C[0] += cs1 * alpha; | |||
| } | |||
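| /********************************************************** | |||
| * Driver: C[M][N] += alpha * sa[M][K] * sb[K][N] | |||
| * N is tiled by 12, then 8 or 4, then 2, then 1 columns; | |||
| * within each N tile, M is tiled by 4, 2 and 1 rows and the | |||
| * matching micro-kernel above is called on the packed panels. | |||
| *********************************************************/ | |||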
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, | |||
| FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
| for (; N >= 12; N -= 12) { | |||
| BLASLONG m_left = M; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 4; | |||
| a_ += 4 * K; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 2; | |||
| a_ += 2 * K; | |||
| } | |||
| if (m_left) { | |||
| dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); | |||
| } | |||
| sb += 12 * K; | |||
| C += 12 * LDC; | |||
| } | |||
| if (N >= 8) { | |||
| N -= 8; | |||
| BLASLONG m_left = M; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 4; | |||
| a_ += 4 * K; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 2; | |||
| a_ += 2 * K; | |||
| } | |||
| if (m_left) { | |||
| dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); | |||
| } | |||
| sb += 8 * K; | |||
| C += 8 * LDC; | |||
| } else if (N >= 4) { | |||
| N -= 4; | |||
| BLASLONG m_left = M; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 4; | |||
| a_ += 4 * K; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 2; | |||
| a_ += 2 * K; | |||
| } | |||
| if (m_left) { | |||
| dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha); | |||
| } | |||
| sb += 4 * K; | |||
| C += 4 * LDC; | |||
| } | |||
| if (N >= 2) { | |||
| N -= 2; | |||
| BLASLONG m_left = M; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 4; | |||
| a_ += 4 * K; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 2; | |||
| a_ += 2 * K; | |||
| } | |||
| if (m_left) { | |||
| dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha); | |||
| } | |||
| sb += 2 * K; | |||
| C += 2 * LDC; | |||
| } | |||
| if (N) { | |||
| BLASLONG m_left = M; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 4; | |||
| a_ += 4 * K; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha); | |||
| c_ += 2; | |||
| a_ += 2 * K; | |||
| } | |||
| if (m_left) { | |||
| dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* x0 x1 x2 d0 x3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define lanes x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define alpha0 d10 | |||
| #define alphaZ z2.d | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 lanes | |||
| // 16 pA | |||
| // 17 alpha | |||
| // 18 must save | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
| //v00 ALPHA -> pA0_0 | |||
| //v01 pA0_1 | |||
| //v02 ALPHA0 | |||
| //v03 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| //v07 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB0_4 | |||
| //v13 must save pB0_5 | |||
| //v14 must save pB0_6 | |||
| //v15 must save pB0_7 | |||
| //v16 must save C0 | |||
| //v17 must save C1 | |||
| //v18 must save C2 | |||
| //v19 must save C3 | |||
| //v20 must save C4 | |||
| //v21 must save C5 | |||
| //v22 must save C6 | |||
| //v23 must save C7 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
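| // Software-pipelined SVE micro-kernel: one vector of M rows times 8 columns of B per K step. | |||
| // INITv1x8 zeroes the accumulators z16-z23; KERNELv1x8_I is the prologue (loads A into z0/z1 and | |||
| // broadcasts the first 8 B values); _M1 and _M2 alternate between z0 and z1 so that the loads for | |||
| // the next K step overlap the FMAs of the current one; _E drains the pipeline without further | |||
| // loads; _SUB is the plain single-step variant used for the K%8 tail. SAVEv1x8 scales the | |||
| // accumulators by alpha and adds them into 8 consecutive columns of C. | |||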
| .macro INITv1x8 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| dup z20.d, #0 | |||
| dup z21.d, #0 | |||
| dup z22.d, #0 | |||
| dup z23.d, #0 | |||
| .endm | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA] | |||
| ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| .macro KERNELv1x8_M1 | |||
| ld1d z1.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| .macro KERNELv1x8_M2 | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z1.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z1.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z1.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z1.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z1.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| .macro KERNELv1x8_E | |||
| fmla z16.d, p1/m, z1.d, z8.d | |||
| fmla z17.d, p1/m, z1.d, z9.d | |||
| fmla z18.d, p1/m, z1.d, z10.d | |||
| fmla z19.d, p1/m, z1.d, z11.d | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| fmla z23.d, p1/m, z1.d, z15.d | |||
| .endm | |||
| .macro KERNELv1x8_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| .endm | |||
| .macro SAVEv1x8 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z28.d, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaZ | |||
| st1d z28.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z29.d, p1/z, [pCRow1] | |||
| fmla z29.d, p1/m, z21.d, alphaZ | |||
| st1d z29.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z30.d, p1/z, [pCRow2] | |||
| fmla z30.d, p1/m, z22.d, alphaZ | |||
| st1d z30.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z31.d, p1/z, [pCRow1] | |||
| fmla z31.d, p1/m, z23.d, alphaZ | |||
| st1d z31.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x4 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| add pB, pB, 32 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x2 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| .endm | |||
| .macro KERNELv1x2_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| add pB, pB, 16 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x1 | |||
| dup z16.d, #0 | |||
| .endm | |||
| .macro KERNELv1x1_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| add pB, pB, 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| dup alphaZ, alpha | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| ptrue p0.d // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| /* The loop over M is done in an SVE fashion: the final M % SVE_LEN iterations are handled in a single predicated sweep. */ | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM | |||
| cntp lanes, p0, p1.d // lanes holds the number of active SVE lanes in the M dimension | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x8 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 // are there at least 2 unrolled blocks (16 iterations of K) to do? | |||
| blt .Ldgemm_kernel_L8_Mv1_32 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Ldgemm_kernel_L8_Mv1_22a | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L8_Mv1_22 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22a: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_32: | |||
| tst counterL, #1 | |||
| ble .Ldgemm_kernel_L8_Mv1_40 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .Ldgemm_kernel_L8_Mv1_40: | |||
| INITv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L8_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_46: | |||
| KERNELv1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L8_Mv1_46 | |||
| .Ldgemm_kernel_L8_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d // lanes holds the number of active SVE lanes in the M dimension | |||
| b.any .Ldgemm_kernel_L8_Mv1_20 | |||
| .Ldgemm_kernel_L8_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 8 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #4 | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L4_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x4 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least one block of 8 to do? | |||
| ble .Ldgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L4_Mv1_22 | |||
| .Ldgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L4_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L4_Mv1_46 | |||
| .Ldgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Ldgemm_kernel_L4_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L4_Mv1_20 | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #2 | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x2 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least one block of 8 to do? | |||
| ble .Ldgemm_kernel_L2_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L2_Mv1_22 | |||
| .Ldgemm_kernel_L2_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L2_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L2_Mv1_46 | |||
| .Ldgemm_kernel_L2_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x2 | |||
| .Ldgemm_kernel_L2_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L2_Mv1_20 | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC // add 1 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x1 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L1_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_22 | |||
| .Ldgemm_kernel_L1_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L1_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_46 | |||
| .Ldgemm_kernel_L1_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x1 | |||
| .Ldgemm_kernel_L1_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L1_Mv1_20 | |||
| .Ldgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
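| /* Packs the source matrix into b in panels of up to svcntd() elements along the n dimension: | |||
|    the predicate selects the active lanes, the inner loop walks the m dimension one element at | |||
|    a time, and each gather load (stride lda) stores its `active` elements contiguously into b. | |||
|    The final, narrower panel is handled by the same predicated loop. */ | |||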
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| svint64_t lda_vec = svindex_s64(0LL, lda); | |||
| uint64_t sve_size = svcntd(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1++; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size * lda; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,77 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
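| /* Packing loop with contiguous loads: the predicate covers up to svcntd() | |||
|  * elements of the n dimension, each inner iteration copies `active` | |||
|  * consecutive elements of A into b and then steps by lda through the m | |||
|  * dimension, and the outer loop advances to the next svcntd()-wide stripe. */ | |||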
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| uint64_t sve_size = svcntd(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1(pg, (double *)aoffset1); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* X0 X1 X2 s0 X3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define lanes x15 | |||
| #define pA x16 | |||
| #define alpha w17 | |||
| #define alpha0 s10 | |||
| #define alphaZ z2.s | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 lanes | |||
| // 16 pA | |||
| // 17 alpha | |||
| // 18 must save | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
| //v00 ALPHA -> pA0_0 | |||
| //v01 pA0_1 | |||
| //v02 ALPHA0 | |||
| //v03 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| //v07 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB0_4 | |||
| //v13 must save pB0_5 | |||
| //v14 must save pB0_6 | |||
| //v15 must save pB0_7 | |||
| //v16 must save C0 | |||
| //v17 must save C1 | |||
| //v18 must save C2 | |||
| //v19 must save C3 | |||
| //v20 must save C4 | |||
| //v21 must save C5 | |||
| //v22 must save C6 | |||
| //v23 must save C7 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| .macro INITv1x8 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| dup z18.s, #0 | |||
| dup z19.s, #0 | |||
| dup z20.s, #0 | |||
| dup z21.s, #0 | |||
| dup z22.s, #0 | |||
| dup z23.s, #0 | |||
| .endm | |||
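| // Software-pipelined FMA macros for the v1x8 tile: | |||
| //   KERNELv1x8_I   primes the pipeline - loads the current A vector into z0, | |||
| //                  the next one into z1 and two groups of 8 broadcast B values. | |||
| //   KERNELv1x8_M1  multiplies with z0 while reloading z1 and the next B values. | |||
| //   KERNELv1x8_M2  multiplies with z1 while reloading z0 and the next B values. | |||
| //   KERNELv1x8_E   drains the pipeline - multiplies with z1, no further loads. | |||
| //   KERNELv1x8_SUB standalone variant used for the K%8 tail iterations. | |||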
| .macro KERNELv1x8_I | |||
| ld1w z0.s, p1/z, [pA] | |||
| ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_M1 | |||
| ld1w z1.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_M2 | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| fmla z16.s, p1/m, z1.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z1.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z1.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z1.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z1.s, z12.s | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.s, p1/m, z1.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z1.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z1.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_E | |||
| fmla z16.s, p1/m, z1.s, z8.s | |||
| fmla z17.s, p1/m, z1.s, z9.s | |||
| fmla z18.s, p1/m, z1.s, z10.s | |||
| fmla z19.s, p1/m, z1.s, z11.s | |||
| fmla z20.s, p1/m, z1.s, z12.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.s, p1/m, z1.s, z13.s | |||
| fmla z22.s, p1/m, z1.s, z14.s | |||
| fmla z23.s, p1/m, z1.s, z15.s | |||
| .endm | |||
| .macro KERNELv1x8_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| .endm | |||
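| // SAVEv1x8 writes the 8 accumulated rows back as C(row) = C(row) + alpha * acc, | |||
| // stepping through the rows via pCRow1/pCRow2 (each offset by LDC) and finally | |||
| // advancing pCRow0 by `lanes` elements. | |||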
| .macro SAVEv1x8 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z26.s, p1/z, [pCRow2] | |||
| fmla z26.s, p1/m, z18.s, alphaZ | |||
| st1w z26.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z27.s, p1/z, [pCRow1] | |||
| fmla z27.s, p1/m, z19.s, alphaZ | |||
| st1w z27.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z28.s, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaZ | |||
| st1w z28.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z29.s, p1/z, [pCRow1] | |||
| fmla z29.s, p1/m, z21.s, alphaZ | |||
| st1w z29.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z30.s, p1/z, [pCRow2] | |||
| fmla z30.s, p1/m, z22.s, alphaZ | |||
| st1w z30.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z31.s, p1/z, [pCRow1] | |||
| fmla z31.s, p1/m, z23.s, alphaZ | |||
| st1w z31.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x4 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| dup z18.s, #0 | |||
| dup z19.s, #0 | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| add pB, pB, 16 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z26.s, p1/z, [pCRow2] | |||
| fmla z26.s, p1/m, z18.s, alphaZ | |||
| st1w z26.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z27.s, p1/z, [pCRow1] | |||
| fmla z27.s, p1/m, z19.s, alphaZ | |||
| st1w z27.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x2 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| .endm | |||
| .macro KERNELv1x2_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| add pB, pB, 8 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x1 | |||
| dup z16.s, #0 | |||
| .endm | |||
| .macro KERNELv1x1_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| add pB, pB, 4 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, s0 | |||
| dup alphaZ, alpha | |||
| lsl LDC, LDC, #2 // ldc = ldc * 4 | |||
| ptrue p0.s // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as at least 8 columns are left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
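| // In pseudo-code:  counterI = 0; p1 = whilelt(counterI, M); lanes = cntp(p1); | |||
| //                  do { process `lanes` rows; | |||
| //                       counterI += vector length (incw); | |||
| //                       p1 = whilelt(counterI, M); lanes = cntp(p1); | |||
| //                  } while (p1 has any active lane);  // b.any tests whilelt's flags | |||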
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM | |||
| cntp lanes, p0, p1.s // lanes holds the number of active SVE lanes in the M dimension | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x8 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 // are there at least 2 unrolled blocks (of 8 updates) to do? | |||
| blt .Ldgemm_kernel_L8_Mv1_32 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Ldgemm_kernel_L8_Mv1_22a | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L8_Mv1_22 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22a: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_32: | |||
| tst counterL, #1 | |||
| ble .Ldgemm_kernel_L8_Mv1_40 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .Ldgemm_kernel_L8_Mv1_40: | |||
| INITv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L8_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_46: | |||
| KERNELv1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L8_Mv1_46 | |||
| .Ldgemm_kernel_L8_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s // lanes holds the number of active SVE lanes in the M dimension | |||
| b.any .Ldgemm_kernel_L8_Mv1_20 | |||
| .Ldgemm_kernel_L8_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 8 * 4 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if at least 4 columns are left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #4 | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L4_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x4 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L4_Mv1_22 | |||
| .Ldgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L4_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L4_Mv1_46 | |||
| .Ldgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Ldgemm_kernel_L4_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L4_Mv1_20 | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #4 | |||
| add origPB, origPB, temp // B = B + K * 4 * 4 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if at least 2 columns are left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #2 | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x2 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L2_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L2_Mv1_22 | |||
| .Ldgemm_kernel_L2_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L2_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L2_Mv1_46 | |||
| .Ldgemm_kernel_L2_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x2 | |||
| .Ldgemm_kernel_L2_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L2_Mv1_20 | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 column is left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC // add 1 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x1 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L1_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_22 | |||
| .Ldgemm_kernel_L1_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L1_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_46 | |||
| .Ldgemm_kernel_L1_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x1 | |||
| .Ldgemm_kernel_L1_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L1_Mv1_20 | |||
| .Ldgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,78 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
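| /* Single-precision counterpart of the gather-based packing loop: the n | |||
|  * dimension is predicated with svwhilelt_b32/svcntw(), and each inner step | |||
|  * gathers one element per active lda-strided column and stores the group | |||
|  * contiguously into b. */ | |||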
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| svint32_t lda_vec = svindex_s32(0LL, lda); | |||
| uint32_t sve_size = svcntw(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | |||
| svst1_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1++; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size * lda; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,77 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
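| /* Single-precision packing loop with contiguous loads: `active` consecutive | |||
|  * elements are copied per inner step, advancing by lda through the m | |||
|  * dimension and by svcntw() elements per outer stripe. */ | |||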
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| uint32_t sve_size = svcntw(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec = svld1(pg, (float *) aoffset1); | |||
| svst1_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,143 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
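| /* Packing routine for a block of a symmetric matrix (judging by the posX/posY | |||
|  * arguments and the surrounding SVE kernels).  Per lane, the predicate `cmp` | |||
|  * records on which side of the diagonal the element lies; gat_ind is then | |||
|  * advanced by lda for lanes on one side and by 1 on the other, so every | |||
|  * packed element is read from the stored triangle regardless of which half | |||
|  * of the symmetric matrix it logically belongs to. */ | |||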
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, offset; | |||
| #if defined(DOUBLE) | |||
| uint64_t sve_size = svcntd(); | |||
| svint64_t posY_vec = svdup_s64(posY); | |||
| svint64_t posX_vec = svdup_s64(posX); | |||
| svint64_t lda_vec = svdup_s64(lda); | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| do { | |||
| offset = posX - posY; | |||
| svint64_t vec_off = svdup_s64(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint64_t temp = svadd_z(pg, posX_vec, index); | |||
| svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| uint32_t sve_size = svcntw(); | |||
| svint32_t posY_vec = svdup_s32(posY); | |||
| svint32_t posX_vec = svdup_s32(posX); | |||
| svint32_t lda_vec = svdup_s32(lda); | |||
| svint32_t one_vec = svdup_s32(1); | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| do { | |||
| offset = posX - posY; | |||
| svint32_t vec_off = svdup_s32(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint32_t temp = svadd_z(pg, posX_vec, index); | |||
| svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,143 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
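| /* Mirrored variant of the symmetric packing routine above: the select and | |||
|  * the per-lane index increments (1 vs. lda) are swapped, so the opposite | |||
|  * triangle of the stored matrix is traversed. */ | |||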
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, offset; | |||
| #if defined(DOUBLE) | |||
| uint64_t sve_size = svcntd(); | |||
| svint64_t posY_vec = svdup_s64(posY); | |||
| svint64_t posX_vec = svdup_s64(posX); | |||
| svint64_t lda_vec = svdup_s64(lda); | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| do { | |||
| offset = posX - posY; | |||
| svint64_t vec_off = svdup_s64(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint64_t temp = svadd_z(pg, posX_vec, index); | |||
| svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint64_t gat_ind = svsel(cmp, temp2, temp1); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, one_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| uint32_t sve_size = svcntw(); | |||
| svint32_t posY_vec = svdup_s32(posY); | |||
| svint32_t posX_vec = svdup_s32(posX); | |||
| svint32_t lda_vec = svdup_s32(lda); | |||
| svint32_t one_vec = svdup_s32(1); | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| do { | |||
| offset = posX - posY; | |||
| svint32_t vec_off = svdup_s32(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint32_t temp = svadd_z(pg, posX_vec, index); | |||
| svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint32_t gat_ind = svsel(cmp, temp2, temp1); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, one_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
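| /* Triangular packing routine: the panel is walked in stripes of n_active | |||
|  * columns.  Positions strictly on one side of the diagonal are packed with | |||
|  * an lda-strided gather, positions on the other side are only skipped over | |||
|  * (pointers advance, nothing is stored), and the n_active x n_active | |||
|  * diagonal block is expanded element by element - ONE on the diagonal when | |||
|  * UNIT is defined and ZERO in the empty triangle of the block. */ | |||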
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| js = 0; | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posY + posX * lda; | |||
| } else { | |||
| ao = a + posX + posY * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X > posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X < posY) { | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k <= j; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #endif | |||
| ao += n_active; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
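| /* Stripe-wise triangular packing as in the gather-based routine above, but | |||
|  * the copied positions are loaded contiguously (svld1) and the diagonal | |||
|  * block is read row-wise via *(ao + j*lda + k). */ | |||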
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posY + posX * lda; | |||
| } else { | |||
| ao = a + posX + posY * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X > posY) { | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X < posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| for (int k = j; k < n_active; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| } | |||
| #endif | |||
| ao += n_active * lda; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| js = 0; | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posX + posY * lda; | |||
| } else { | |||
| ao = a + posY + posX * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X < posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X > posY) { | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| for (int k = j; k < n_active; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| } | |||
| #endif | |||
| ao += n_active; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,134 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
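| /* Packs an n_active-column panel of the triangular matrix A into the | |||
|    contiguous buffer b, using predicated contiguous SVE loads (svld1) so the | |||
|    loop stays vector-length agnostic. Rows on one side of the diagonal block | |||
|    are copied, rows on the other side are skipped, and the diagonal block is | |||
|    filled element-wise (ZERO in the unused triangle, ONE on the diagonal when | |||
|    UNIT is defined). */ | |||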
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posX + posY * lda; | |||
| } else { | |||
| ao = a + posY + posX * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X < posY) { | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X > posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k <= j; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #endif | |||
| ao += n_active * lda; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,736 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| /******************************************************************************* | |||
| The complex GEMM kernels in OpenBLAS use static configuration of conjugation | |||
| modes via specific macros: | |||
| MACRO_NAME | conjugation on matrix A | conjugation on matrix B | | |||
| ---------- | ----------------------- | ----------------------- | | |||
| NN/NT/TN/TT | No | No | | |||
| NR/NC/TR/TC | No | Yes | | |||
| RN/RT/CN/CT | Yes | No | | |||
| RR/RC/CR/CC | Yes | Yes | | |||
| "conjugation on matrix A" means the complex conjugates of elements from | |||
| matrix A are used for matmul (rather than the original elements). "conjugation | |||
| on matrix B" means the complex conjugate of each element from matrix B is taken | |||
| for matrix multiplication, respectively. | |||
| Complex numbers in arrays or matrices are usually packed together as an | |||
| array of struct (without padding): | |||
| struct complex_number { | |||
| FLOAT real_part; | |||
| FLOAT imag_part; | |||
| }; | |||
| For a double-precision complex array ARR[], which is usually declared as a | |||
| plain array of double, the real part of its Kth complex number is | |||
| ARR[2 * K] and the imaginary part is ARR[2 * K + 1]. | |||
| This file uses 2 ways to vectorize matrix multiplication of complex numbers: | |||
| (1) Expanded-form | |||
| During accumulation along direction K: | |||
| Σk(a[0][k].real b[k][n].real) | |||
| accumulate Σk(a[0][k].imag b[k][n].real) | |||
| -------------------> . | |||
| | * b[k][n].real . | |||
| | (broadcasted) . | |||
| a[0][k].real Σk(a[v-1][k].real b[k][n].real) | |||
| a[0][k].imag Σk(a[v-1][k].imag b[k][n].real) | |||
| . VECTOR I | |||
| (vec_a) . | |||
| . | |||
| a[v-1][k].real Σk(a[0][k].real b[k][n].imag) | |||
| a[v-1][k].imag Σk(a[0][k].imag b[k][n].imag) | |||
| | . | |||
| | accumulate . | |||
| -------------------> . | |||
| * b[k][n].imag Σk(a[v-1][k].real b[k][n].imag) | |||
| (broadcasted) Σk(a[v-1][k].imag b[k][n].imag) | |||
| VECTOR II | |||
| After accumulation, prior to storage: | |||
| -1 -Σk(a[0][k].imag b[k][n].imag) | |||
| 1 Σk(a[0][k].real b[k][n].imag) | |||
| . . | |||
| VECTOR II permute and multiply . to get . | |||
| . . | |||
| -1 -Σk(a[v-1][k].imag b[k][n].imag) | |||
| 1 Σk(a[v-1][k].real b[k][n].imag) | |||
| then add with VECTOR I to get the result vector of elements of C. | |||
| 2 vector registers are needed for every v elements of C, with | |||
| v == sizeof(vector) / sizeof(complex) | |||
| (2) Contracted-form | |||
| During accumulation along direction K: | |||
| (the K coordinate is not shown, since the operation is identical for each k) | |||
| (load vector in mem) (load vector in mem) | |||
| a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1].i | |||
| | | | |||
| | unzip operation (or VLD2 in arm neon) | | |||
| ----------------------------------------------------- | |||
| | | |||
| | | |||
| -------------------------------------------------- | |||
| | | | |||
| | | | |||
| v v | |||
| a[0].real ... a[2v-1].real a[0].imag ... a[2v-1].imag | |||
| | | | | | |||
| | | * b[i].imag(broadcast) | | | |||
| * b[i].real | -----------------------------|---- | * b[i].real | |||
| (broadcast) | | | | (broadcast) | |||
| | ------------------------------ | | | |||
| + | - | * b[i].imag(broadcast) + | + | | |||
| v v v v | |||
| (accumulate) (accumulate) | |||
| c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag | |||
| VECTOR_REAL VECTOR_IMAG | |||
| After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) | |||
| then stored to matrix C directly. | |||
| For 2v elements of C, only 2 vector registers are needed, while | |||
| 4 registers are required for expanded-form. | |||
| (v == sizeof(vector) / sizeof(complex)) | |||
| For AArch64 zgemm, the 4x4 kernel would need all 32 128-bit NEON registers | |||
| just to hold the elements of C in expanded form, so register spilling would | |||
| occur. Contracted-form operation is therefore selected for the 4x4 kernel. | |||
| For all other combinations of unroll parameters (2x4, 4x2, 2x2, and so on), | |||
| expanded-form mode is used, bringing more NEON registers into use to hide | |||
| the latency of the multiply-add instructions. (A scalar sketch of what the | |||
| expanded-form accumulation computes follows this comment.) | |||
| ******************************************************************************/ | |||
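| /* Illustrative only (a minimal scalar sketch, not used by the kernels below): | |||
|    what the expanded-form accumulation and the alpha update compute for a | |||
|    single element of C in the NN (no conjugation) case. The function name is | |||
|    hypothetical; sa and sb follow the same packed layout as kernel_1x1 below, | |||
|    i.e. one complex element (two doubles) per k step. */ | |||
| #if 0 | |||
| static void kernel_1x1_scalar_sketch(const double *sa, const double *sb, | |||
|                                      double *C, BLASLONG K, | |||
|                                      double alphar, double alphai) { | |||
|   double acc_rr = 0, acc_ir = 0, acc_ri = 0, acc_ii = 0; /* 4 accumulators */ | |||
|   for (; K; K--) { | |||
|     double ar = sa[0], ai = sa[1]; sa += 2;   /* packed A element for this k */ | |||
|     double br = sb[0], bi = sb[1]; sb += 2;   /* packed B element for this k */ | |||
|     acc_rr += ar * br;  acc_ir += ai * br;    /* "VECTOR I"  lanes            */ | |||
|     acc_ri += ar * bi;  acc_ii += ai * bi;    /* "VECTOR II" lanes            */ | |||
|   } | |||
|   double res_r = acc_rr - acc_ii;             /* real part of the dot product */ | |||
|   double res_i = acc_ir + acc_ri;             /* imag part of the dot product */ | |||
|   C[0] += alphar * res_r - alphai * res_i;    /* C += alpha * result          */ | |||
|   C[1] += alphar * res_i + alphai * res_r; | |||
| } | |||
| #endif | |||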
| static inline float64x2_t set_f64x2(double lo, double hi) { | |||
| float64x2_t ret = vdupq_n_f64(0); | |||
| ret = vsetq_lane_f64(lo, ret, 0); | |||
| ret = vsetq_lane_f64(hi, ret, 1); | |||
| return ret; | |||
| } | |||
| static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) { | |||
| float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }}; | |||
| return ret; | |||
| } | |||
| /***************************************************************** | |||
| * operation: *c += alpha * c_value //complex multiplication | |||
| * expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } } | |||
| * expanded_c: {{ arbr, aibr }, { arbi, aibi }} | |||
| ****************************************************************/ | |||
| static inline void store_1c(double *c, float64x2x2_t expanded_c, | |||
| float64x2x2_t expanded_alpha) { | |||
| float64x2_t ld = vld1q_f64(c); | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); | |||
| double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); | |||
| double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); | |||
| double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); | |||
| #else | |||
| double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); | |||
| double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); | |||
| #endif | |||
| ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real); | |||
| vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag)); | |||
| } | |||
| static inline void pref_c_4(const double *c) { | |||
| __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):); | |||
| } | |||
| static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) { | |||
| float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]), | |||
| vaddq_f64(ec1.val[1], ec2.val[1]) }}; | |||
| return ret; | |||
| } | |||
| static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) { | |||
| float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }}; | |||
| return ret; | |||
| } | |||
| static inline float64x2x2_t init() { | |||
| float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }}; | |||
| return ret; | |||
| } | |||
| static inline void kernel_1x1(const double *sa, const double *sb, double *C, | |||
| BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| for (; K > 3; K -= 4) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
| a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
| b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a2, b2); | |||
| c3 = update_ec(c3, a3, b3); | |||
| c4 = update_ec(c4, a4, b4); | |||
| } | |||
| c1 = add_ec(c1, c2); | |||
| c3 = add_ec(c3, c4); | |||
| c1 = add_ec(c1, c3); | |||
| for (; K; K--) { | |||
| c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| } | |||
| static inline void kernel_2x1(const double *sa, const double *sb, double *C, | |||
| BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
| a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a2, b1); | |||
| c3 = update_ec(c3, a3, b2); | |||
| c4 = update_ec(c4, a4, b2); | |||
| } | |||
| c1 = add_ec(c1, c3); | |||
| c2 = add_ec(c2, c4); | |||
| if (K) { | |||
| float64x2_t b1 = vld1q_f64(sb); | |||
| c1 = update_ec(c1, vld1q_f64(sa), b1); | |||
| c2 = update_ec(c2, vld1q_f64(sa + 2), b1); | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + 2, c2, expanded_alpha); | |||
| } | |||
| static inline void kernel_1x2(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| for (; K > 1; K -= 2) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
| b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a1, b2); | |||
| c3 = update_ec(c3, a2, b3); | |||
| c4 = update_ec(c4, a2, b4); | |||
| } | |||
| c1 = add_ec(c1, c3); | |||
| c2 = add_ec(c2, c4); | |||
| if (K) { | |||
| float64x2_t a1 = vld1q_f64(sa); | |||
| c1 = update_ec(c1, a1, vld1q_f64(sb)); | |||
| c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + LDC * 2, c2, expanded_alpha); | |||
| } | |||
| static inline void kernel_2x2(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| for (; K; K--) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a2, b1); | |||
| c3 = update_ec(c3, a1, b2); | |||
| c4 = update_ec(c4, a2, b2); | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c3, expanded_alpha); | |||
| store_1c(C + 2, c4, expanded_alpha); | |||
| } | |||
| static inline void kernel_4x1(const double *sa, const double *sb, double *C, | |||
| BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| pref_c_4(C); | |||
| for (; K; K--) { | |||
| float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
| c1 = update_ec(c1, vld1q_f64(sa), b1); | |||
| c2 = update_ec(c2, vld1q_f64(sa + 2), b1); | |||
| c3 = update_ec(c3, vld1q_f64(sa + 4), b1); | |||
| c4 = update_ec(c4, vld1q_f64(sa + 6), b1); | |||
| sa += 8; | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + 2, c2, expanded_alpha); | |||
| store_1c(C + 4, c3, expanded_alpha); | |||
| store_1c(C + 6, c4, expanded_alpha); | |||
| } | |||
| static inline void kernel_4x2(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; | |||
| c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); | |||
| pref_c_4(C); | |||
| pref_c_4(C + LDC * 2); | |||
| for (; K; K--) { | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
| a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a2, b1); | |||
| c3 = update_ec(c3, a3, b1); | |||
| c4 = update_ec(c4, a4, b1); | |||
| c5 = update_ec(c5, a1, b2); | |||
| c6 = update_ec(c6, a2, b2); | |||
| c7 = update_ec(c7, a3, b2); | |||
| c8 = update_ec(c8, a4, b2); | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + 2, c2, expanded_alpha); | |||
| store_1c(C + 4, c3, expanded_alpha); | |||
| store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c5, expanded_alpha); | |||
| store_1c(C + 2, c6, expanded_alpha); | |||
| store_1c(C + 4, c7, expanded_alpha); | |||
| store_1c(C + 6, c8, expanded_alpha); | |||
| } | |||
| static inline void kernel_1x4(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4; | |||
| c1 = c2 = c3 = c4 = init(); | |||
| for (; K; K--) { | |||
| float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
| c1 = update_ec(c1, a1, vld1q_f64(sb)); | |||
| c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); | |||
| c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); | |||
| c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); | |||
| sb += 8; | |||
| } | |||
| store_1c(C, c1, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c2, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c3, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c4, expanded_alpha); | |||
| } | |||
| static inline void kernel_2x4(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
| float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; | |||
| c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); | |||
| for (; K; K--) { | |||
| float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
| float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
| b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
| c1 = update_ec(c1, a1, b1); | |||
| c2 = update_ec(c2, a2, b1); | |||
| c3 = update_ec(c3, a1, b2); | |||
| c4 = update_ec(c4, a2, b2); | |||
| c5 = update_ec(c5, a1, b3); | |||
| c6 = update_ec(c6, a2, b3); | |||
| c7 = update_ec(c7, a1, b4); | |||
| c8 = update_ec(c8, a2, b4); | |||
| } | |||
| store_1c(C, c1, expanded_alpha); | |||
| store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c3, expanded_alpha); | |||
| store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c5, expanded_alpha); | |||
| store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; | |||
| store_1c(C, c7, expanded_alpha); | |||
| store_1c(C + 2, c8, expanded_alpha); | |||
| } | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define FMLA_RI "fmla " | |||
| #define FMLA_IR "fmla " | |||
| #define FMLA_II "fmls " | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define FMLA_RI "fmls " | |||
| #define FMLA_IR "fmla " | |||
| #define FMLA_II "fmla " | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define FMLA_RI "fmla " | |||
| #define FMLA_IR "fmls " | |||
| #define FMLA_II "fmla " | |||
| #else | |||
| #define FMLA_RI "fmls " | |||
| #define FMLA_IR "fmls " | |||
| #define FMLA_II "fmls " | |||
| #endif | |||
| #define FMLA_RR "fmla " | |||
| static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i, | |||
| float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { | |||
| float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); | |||
| up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); | |||
| up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); | |||
| lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); | |||
| lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); | |||
| up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); | |||
| up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); | |||
| lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); | |||
| lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar); | |||
| vst2q_f64(C, up); | |||
| vst2q_f64(C + 4, lo); | |||
| } | |||
| static inline void kernel_4x4(const double *sa, const double *sb, double *C, | |||
| BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
| float64x2_t c1r, c1i, c2r, c2i; | |||
| float64x2_t c3r, c3i, c4r, c4i; | |||
| float64x2_t c5r, c5i, c6r, c6i; | |||
| float64x2_t c7r, c7i, c8r, c8i; | |||
| const double *pref_ = C; | |||
| pref_c_4(pref_); pref_ += LDC * 2; | |||
| pref_c_4(pref_); pref_ += LDC * 2; | |||
| pref_c_4(pref_); pref_ += LDC * 2; | |||
| pref_c_4(pref_); | |||
| __asm__ __volatile__( | |||
| "cmp %[K],#0\n\t" | |||
| "movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" | |||
| "movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" | |||
| "movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" | |||
| "movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" | |||
| "beq 4f; cmp %[K],#2\n\t" | |||
| "ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" | |||
| "ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" | |||
| "ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" | |||
| "beq 2f; blt 3f\n\t" | |||
| "1:\n\t" | |||
| "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" | |||
| FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" | |||
| FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
| FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
| "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" | |||
| FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" | |||
| FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
| FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
| "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" | |||
| FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" | |||
| FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
| FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
| "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" | |||
| FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
| FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
| FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
| "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" | |||
| FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" | |||
| FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
| FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
| "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" | |||
| FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" | |||
| FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
| FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
| "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" | |||
| FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" | |||
| FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
| FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
| "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" | |||
| FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" | |||
| FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
| FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
| "fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" | |||
| FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" | |||
| FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
| FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
| "fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" | |||
| FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" | |||
| FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
| FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
| "fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" | |||
| FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" | |||
| FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" | |||
| FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" | |||
| "fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" | |||
| FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" | |||
| FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" | |||
| FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" | |||
| "fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" | |||
| FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" | |||
| FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" | |||
| FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" | |||
| "fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" | |||
| FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" | |||
| FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" | |||
| FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" | |||
| "fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" | |||
| FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" | |||
| FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" | |||
| FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" | |||
| "fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" | |||
| FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" | |||
| FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" | |||
| FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" | |||
| FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" | |||
| FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" | |||
| FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" | |||
| FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" | |||
| FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" | |||
| FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" | |||
| FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" | |||
| FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" | |||
| FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" | |||
| FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" | |||
| FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" | |||
| FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" | |||
| FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" | |||
| FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" | |||
| FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" | |||
| FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" | |||
| "2:\n\t" | |||
| "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" | |||
| FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" | |||
| FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
| FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
| "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" | |||
| FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" | |||
| FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
| FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
| "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" | |||
| FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" | |||
| FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
| FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
| "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" | |||
| FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
| FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
| FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
| "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" | |||
| FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" | |||
| FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
| FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
| "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" | |||
| FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" | |||
| FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
| FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
| "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" | |||
| FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" | |||
| FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
| FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
| "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" | |||
| FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" | |||
| FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
| FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
| "fmov v15.d[1],x0\n\t" | |||
| FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" | |||
| FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
| FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
| FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" | |||
| FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
| FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
| FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" | |||
| FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" | |||
| FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" | |||
| FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" | |||
| FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" | |||
| FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" | |||
| FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" | |||
| FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" | |||
| FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" | |||
| FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" | |||
| FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" | |||
| FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" | |||
| FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" | |||
| FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" | |||
| FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" | |||
| FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" | |||
| FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" | |||
| FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" | |||
| FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" | |||
| FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" | |||
| FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" | |||
| FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" | |||
| FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" | |||
| FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" | |||
| FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" | |||
| FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" | |||
| FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" | |||
| FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" | |||
| FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" | |||
| FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" | |||
| FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" | |||
| FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" | |||
| FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" | |||
| FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" | |||
| "3:\n\t" | |||
| "fmov v7.d[1],x0\n\t" | |||
| FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" | |||
| FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
| FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
| FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" | |||
| FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
| FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
| FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" | |||
| FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
| FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
| FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" | |||
| FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
| FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
| FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" | |||
| FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
| FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
| FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" | |||
| FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
| FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
| FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" | |||
| FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
| FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
| FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" | |||
| FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
| FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
| FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" | |||
| FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
| FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
| FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" | |||
| FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
| FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
| FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" | |||
| FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" | |||
| "4:\n\t" | |||
| :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), | |||
| [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), | |||
| [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), | |||
| [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), | |||
| [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) | |||
| ::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | |||
| "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); | |||
| store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; | |||
| store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2; | |||
| store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; | |||
| store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); | |||
| } | |||
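| /* Driver: tiles N by 4/2/1 and M by 4/2/1 and dispatches to the kernels above. | |||
|    sa and sb are the packed panels of A and B; each complex element occupies | |||
|    two doubles, hence the 8*K / 4*K pointer advances per 4-wide / 2-wide tile. */ | |||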
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, | |||
| FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
| BLASLONG n_left = N; | |||
| for (; n_left >= 4; n_left -= 4) { | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
| a_ += 8 * K; | |||
| c_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
| a_ += 4 * K; | |||
| c_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
| } | |||
| sb += 8 * K; | |||
| C += 8 * LDC; | |||
| } | |||
| if (n_left >= 2) { | |||
| n_left -= 2; | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
| a_ += 8 * K; | |||
| c_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
| a_ += 4 * K; | |||
| c_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
| } | |||
| sb += 4 * K; | |||
| C += 4 * LDC; | |||
| } | |||
| if (n_left) { | |||
| const FLOAT *a_ = sa; | |||
| FLOAT *c_ = C; | |||
| BLASLONG m_left = M; | |||
| for (; m_left >= 4; m_left -= 4) { | |||
| kernel_4x1(a_, sb, c_, K, alphar, alphai); | |||
| a_ += 8 * K; | |||
| c_ += 8; | |||
| } | |||
| if (m_left >= 2) { | |||
| m_left -= 2; | |||
| kernel_2x1(a_, sb, c_, K, alphar, alphai); | |||
| a_ += 4 * K; | |||
| c_ += 4; | |||
| } | |||
| if (m_left) { | |||
| kernel_1x1(a_, sb, c_, K, alphar, alphai); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,160 @@ | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #Pure C for other kernels | |||
| SAMAXKERNEL = ../mips/amax.c | |||
| DAMAXKERNEL = ../mips/amax.c | |||
| CAMAXKERNEL = ../mips/zamax.c | |||
| ZAMAXKERNEL = ../mips/zamax.c | |||
| SAMINKERNEL = ../mips/amin.c | |||
| DAMINKERNEL = ../mips/amin.c | |||
| CAMINKERNEL = ../mips/zamin.c | |||
| ZAMINKERNEL = ../mips/zamin.c | |||
| SMAXKERNEL = ../mips/max.c | |||
| DMAXKERNEL = ../mips/max.c | |||
| SMINKERNEL = ../mips/min.c | |||
| DMINKERNEL = ../mips/min.c | |||
| ISAMAXKERNEL = ../mips/iamax.c | |||
| IDAMAXKERNEL = ../mips/iamax.c | |||
| ICAMAXKERNEL = ../mips/izamax.c | |||
| IZAMAXKERNEL = ../mips/izamax.c | |||
| ISAMINKERNEL = ../mips/iamin.c | |||
| IDAMINKERNEL = ../mips/iamin.c | |||
| ICAMINKERNEL = ../mips/izamin.c | |||
| IZAMINKERNEL = ../mips/izamin.c | |||
| ISMAXKERNEL = ../mips/imax.c | |||
| IDMAXKERNEL = ../mips/imax.c | |||
| ISMINKERNEL = ../mips/imin.c | |||
| IDMINKERNEL = ../mips/imin.c | |||
| SASUMKERNEL = ../mips/asum.c | |||
| DASUMKERNEL = ../mips/asum.c | |||
| CASUMKERNEL = ../mips/zasum.c | |||
| ZASUMKERNEL = ../mips/zasum.c | |||
| SSUMKERNEL = ../mips/sum.c | |||
| DSUMKERNEL = ../mips/sum.c | |||
| CSUMKERNEL = ../mips/zsum.c | |||
| ZSUMKERNEL = ../mips/zsum.c | |||
| SAXPYKERNEL = ../mips/axpy.c | |||
| DAXPYKERNEL = ../mips/axpy.c | |||
| CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| SCOPYKERNEL = ../mips/copy.c | |||
| DCOPYKERNEL = ../mips/copy.c | |||
| CCOPYKERNEL = ../mips/zcopy.c | |||
| ZCOPYKERNEL = ../mips/zcopy.c | |||
| SDOTKERNEL = ../mips/dot.c | |||
| DDOTKERNEL = ../mips/dot.c | |||
| CDOTKERNEL = ../mips/zdot.c | |||
| ZDOTKERNEL = ../mips/zdot.c | |||
| SNRM2KERNEL = ../mips/nrm2.c | |||
| DNRM2KERNEL = ../mips/nrm2.c | |||
| CNRM2KERNEL = ../mips/znrm2.c | |||
| ZNRM2KERNEL = ../mips/znrm2.c | |||
| SROTKERNEL = ../mips/rot.c | |||
| DROTKERNEL = ../mips/rot.c | |||
| CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| SSCALKERNEL = ../mips/scal.c | |||
| DSCALKERNEL = ../mips/scal.c | |||
| CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| SSWAPKERNEL = ../mips/swap.c | |||
| DSWAPKERNEL = ../mips/swap.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| SGEMVNKERNEL = ../mips/gemv_n.c | |||
| DGEMVNKERNEL = ../mips/gemv_n.c | |||
| CGEMVNKERNEL = ../mips/zgemv_n.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | |||
| SGEMVTKERNEL = ../mips/gemv_t.c | |||
| DGEMVTKERNEL = ../mips/gemv_t.c | |||
| CGEMVTKERNEL = ../mips/zgemv_t.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
| SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| @@ -1,7 +1,6 @@ | |||
| ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| ifeq ($(HAVE_GAS), 1) | |||
| include $(KERNELDIR)/KERNEL.POWER8 | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| @@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c | |||
| SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c | |||
| SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c | |||
| SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c | |||
| SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c | |||
| SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c | |||
| SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c | |||
| SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c | |||
| SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c | |||
| DGEMMKERNEL = dgemm_kernel_power10.c | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| @@ -43,7 +52,18 @@ DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c | |||
| DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c | |||
| DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c | |||
| DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c | |||
| DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c | |||
| DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c | |||
| DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c | |||
| DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c | |||
| DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c | |||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||
| #CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| @@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| endif | |||
| @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| #endif | |||
| const float *mvecp = mvec; | |||
| /* We have to load reverse mask for big endian. */ | |||
| /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| long ytmp; | |||
| __asm__ | |||
| @@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "xvmaddasp 38, 58, 33 \n\t" | |||
| "xvmaddasp 39, 59, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%4) \n\t" | |||
| "stxv 49, 16(%4) \n\t" | |||
| "stxv 50, 32(%4) \n\t" | |||
| "stxv 51, 48(%4) \n\t" | |||
| "stxv 34, 64(%4) \n\t" | |||
| "stxv 35, 80(%4) \n\t" | |||
| "stxv 38, 96(%4) \n\t" | |||
| "stxv 39, 112(%4) \n\t" | |||
| #else | |||
| "stxv 49, 0(%4) \n\t" | |||
| "stxv 48, 16(%4) \n\t" | |||
| "stxv 51, 32(%4) \n\t" | |||
| @@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "stxv 34, 80(%4) \n\t" | |||
| "stxv 39, 96(%4) \n\t" | |||
| "stxv 38, 112(%4) \n\t" | |||
| #endif | |||
| "addi %4, %4, 128 \n\t" | |||
| "xxperm 52, 40, %x10 \n\t" // exchange real and imag part | |||
| @@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "xvmaddasp 38, 58, 33 \n\t" | |||
| "xvmaddasp 39, 59, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%4) \n\t" | |||
| "stxv 49, 16(%4) \n\t" | |||
| "stxv 50, 32(%4) \n\t" | |||
| "stxv 51, 48(%4) \n\t" | |||
| "stxv 34, 64(%4) \n\t" | |||
| "stxv 35, 80(%4) \n\t" | |||
| "stxv 38, 96(%4) \n\t" | |||
| "stxv 39, 112(%4) \n\t" | |||
| #else | |||
| "stxv 49, 0(%4) \n\t" | |||
| "stxv 48, 16(%4) \n\t" | |||
| "stxv 51, 32(%4) \n\t" | |||
| @@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "stxv 34, 80(%4) \n\t" | |||
| "stxv 39, 96(%4) \n\t" | |||
| "stxv 38, 112(%4) \n\t" | |||
| #endif | |||
| "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" | |||
| : | |||
| @@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| @@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| #endif | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| @@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| @@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| @@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "#n=%1 x=%4=%2 y=%0=%3" | |||
| : | |||
| "=m" (*y), | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #include "common.h" | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| #include "cdot_microk_power10.c" | |||
| #else | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| BLASLONG n1 = n & -16; | |||
| #else | |||
| BLASLONG n1 = n & -8; | |||
| @@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| { | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| @@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| "xxswapd 33, 34 \n\t" | |||
| "xvaddsp 35, 35, 32 \n\t" | |||
| "xvaddsp 34, 34, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xxpermdi 34, 35, 34, 0 \n\t" | |||
| #else | |||
| "xxpermdi 34, 34, 35, 2 \n\t" | |||
| #endif | |||
| "stxv 34, 0(%6) \n\t" | |||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||
| @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "cgemm_macros_power10.S" | |||
| #if (_AIX) | |||
| .set perm_const1, 0x0405060700010203 | |||
| .set perm_const2, 0x0c0d0e0f08090a0b | |||
| .set save_permute_12, 0x1011121300010203 | |||
| .set save_permute_11, 0x18191a1b08090a0b | |||
| #else | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #endif | |||
| #ifndef NEEDPARAM | |||
| @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* load reverse permute mask for big endian | |||
| uint128 = 0x0c0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| #if (_AIX) | |||
| lis T2, (perm_const2>>48 & 0xFFFF) | |||
| lis T1, (perm_const1>>48 & 0xFFFF) | |||
| lis T3, (save_permute_12>>48 & 0xFFFF) | |||
| lis T4, (save_permute_11>>48 & 0xFFFF) | |||
| ori T2, T2, (perm_const2>>32 & 0xFFFF) | |||
| ori T1, T1, (perm_const1>>32 & 0xFFFF) | |||
| ori T3, T3, (save_permute_12>>32 & 0xFFFF) | |||
| ori T4, T4, (save_permute_11>>32 & 0xFFFF) | |||
| #else | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| #endif | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| #if (_AIX) | |||
| oris T2, T2, (perm_const2>>16 & 0xFFFF) | |||
| oris T1, T1, (perm_const1>>16 & 0xFFFF) | |||
| oris T3, T3, (save_permute_12>>16 & 0xFFFF) | |||
| oris T4, T4, (save_permute_11>>16 & 0xFFFF) | |||
| ori T2, T2, (perm_const2 & 0xFFFF) | |||
| ori T1, T1, (perm_const1 & 0xFFFF) | |||
| ori T3, T3, (save_permute_12 & 0xFFFF) | |||
| ori T4, T4, (save_permute_11 & 0xFFFF) | |||
| #else | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| #endif | |||
| li r0,0 | |||
| li PRE,512 | |||
| @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 7, 36, 35 | |||
| xvf32gerpp 6, 37, 35 | |||
| xvf32gerpp 5, 32, 35 | |||
| xvf32gerpp 4, 33, 35 | |||
| #else | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 37, 34 | |||
| xvf32gerpp 5, 32, 34 | |||
| xvf32gerpp 4, 33, 34 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x8_2 | |||
| @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 7, 36, 35 | |||
| xvf32gerpp 6, 37, 35 | |||
| xvf32gerpp 5, 32, 35 | |||
| xvf32gerpp 4, 33, 35 | |||
| #else | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 37, 34 | |||
| xvf32gerpp 5, 32, 34 | |||
| xvf32gerpp 4, 33, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
| lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
| lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xvf32gerpp 3, 42, 38 | |||
| xvf32gerpp 2, 43, 38 | |||
| xvf32gerpp 1, 40, 38 | |||
| xvf32gerpp 0, 41, 38 | |||
| xvf32gerpp 7, 42, 39 | |||
| xvf32gerpp 6, 43, 39 | |||
| xvf32gerpp 5, 40, 39 | |||
| xvf32gerpp 4, 41, 39 | |||
| #else | |||
| xvf32gerpp 3, 42, 39 | |||
| xvf32gerpp 2, 43, 39 | |||
| xvf32gerpp 1, 40, 39 | |||
| @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 43, 38 | |||
| xvf32gerpp 5, 40, 38 | |||
| xvf32gerpp 4, 41, 38 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) | |||
| lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
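| /* interleave the accumulated halves with C: the xxpermdi doubleword selectors and operand order are mirrored on big-endian, where each result sits in the opposite doubleword */ | |||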
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs26, vs26, vs7 | |||
| xvaddsp vs27, vs27, vs5 | |||
| xvaddsp vs28, vs28, vs11 | |||
| xvaddsp vs29, vs29, vs9 | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs4, vs12, 1 | |||
| xxpermdi vs26, vs6, vs14, 1 | |||
| xxpermdi vs29, vs8, vs0, 1 | |||
| xxpermdi vs28, vs10, vs2, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs2, vs10, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| MULT_APLHA_PART1 vs48, vs56, vs0, vs1 | |||
| @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs32, vs32, vs3 | |||
| xvaddsp vs33, vs33, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs40, vs40, vs7 | |||
| xvaddsp vs41, vs41, vs5 | |||
| xvaddsp vs34, vs34, vs11 | |||
| xvaddsp vs35, vs35, vs9 | |||
| xvaddsp vs42, vs42, vs15 | |||
| xvaddsp vs43, vs43, vs13 | |||
| #else | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xxpermdi vs33, vs0, vs8, 1 | |||
| xxpermdi vs32, vs2, vs10, 1 | |||
| xxpermdi vs41, vs4, vs12, 1 | |||
| xxpermdi vs40, vs6, vs14, 1 | |||
| xxpermdi vs35, vs8, vs0, 1 | |||
| xxpermdi vs34, vs10, vs2, 1 | |||
| xxpermdi vs43, vs12, vs4, 1 | |||
| xxpermdi vs42, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs33, vs8, vs0, 2 | |||
| xxpermdi vs32, vs10, vs2, 2 | |||
| @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs34, vs2, vs10, 2 | |||
| xxpermdi vs43, vs4, vs12, 2 | |||
| xxpermdi vs42, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs32, 0(T2) | |||
| stxvp vs40, 32(T2) | |||
| @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 32, 35 | |||
| xvf32gerpp 2, 33, 35 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| #else | |||
| xvf32gerpp 3, 32, 34 | |||
| xvf32gerpp 2, 33, 34 | |||
| xvf32gerpp 1, 32, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x4_2 | |||
| @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 32, 35 | |||
| xvf32gerpp 2, 33, 35 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| #else | |||
| xvf32gerpp 3, 32, 34 | |||
| xvf32gerpp 2, 33, 34 | |||
| xvf32gerpp 1, 32, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
| lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 36, 39 | |||
| xvf32gerpp 2, 37, 39 | |||
| xvf32gerpp 1, 36, 38 | |||
| xvf32gerpp 0, 37, 38 | |||
| #else | |||
| xvf32gerpp 3, 36, 38 | |||
| xvf32gerpp 2, 37, 38 | |||
| xvf32gerpp 1, 36, 39 | |||
| xvf32gerpp 0, 37, 39 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| xvaddsp vs26, vs26, vs11 | |||
| @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvaddsp vs29, vs29, vs5 | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs8, vs0, 1 | |||
| xxpermdi vs26, vs10, vs2, 1 | |||
| xxpermdi vs29, vs4, vs12, 1 | |||
| xxpermdi vs28, vs6, vs14, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs14, vs6, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 0(T1) | |||
| @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 35, 32 | |||
| xvf32gerpp 0, 34, 32 | |||
| #else | |||
| xvf32gerpp 1, 34, 32 | |||
| xvf32gerpp 0, 35, 32 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x2_2 | |||
| @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 35, 32 | |||
| xvf32gerpp 0, 34, 32 | |||
| #else | |||
| xvf32gerpp 1, 34, 33 | |||
| xvf32gerpp 0, 35, 33 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 37, 33 | |||
| xvf32gerpp 0, 36, 33 | |||
| #else | |||
| xvf32gerpp 1, 36, 32 | |||
| xvf32gerpp 0, 37, 32 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 0 | |||
| xxpermdi vs9, vs2, vs10, 0 | |||
| xxpermdi vs3, vs8, vs0, 3 | |||
| xxpermdi vs11, vs10, vs2, 3 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 0 | |||
| xxpermdi vs9, vs10, vs2, 0 | |||
| xxpermdi vs3, vs0, vs8, 3 | |||
| xxpermdi vs11, vs2, vs10, 3 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs1 | |||
| xvaddsp vs26, vs26, vs9 | |||
| xvaddsp vs25, vs25, vs3 | |||
| xvaddsp vs27, vs27, vs11 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs24, vs0, vs8, 0 | |||
| xxpermdi vs26, vs2, vs10, 0 | |||
| xxpermdi vs25, vs8, vs0, 3 | |||
| xxpermdi vs27, vs10, vs2, 3 | |||
| #else | |||
| xxpermdi vs24, vs8, vs0, 0 | |||
| xxpermdi vs26, vs10, vs2, 0 | |||
| xxpermdi vs25, vs0, vs8, 3 | |||
| xxpermdi vs27, vs2, vs10, 3 | |||
| #endif | |||
| #endif | |||
| stxv vs24, 0(CO) | |||
| stxv vs25, 0(T1) | |||
| @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 34, 32 | |||
| xvf32gerpp 1, 35, 32 | |||
| #else | |||
| xvf32gerpp 0, 35, 32 | |||
| xvf32gerpp 1, 34, 32 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x1_2 | |||
| @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD4x1_2O OffsetA, OffsetB | |||
| lxv vs32, (\OffsetA)(AO) | |||
| vspltisb v6, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs33, vs32, vs38, 2 | |||
| xxpermdi vs32, vs32, vs38, 0 | |||
| #else | |||
| xxpermdi vs33, vs32, vs38, 0 | |||
| xxpermdi vs32, vs32, vs38, 2 | |||
| #endif | |||
| lxvp vs34, (0+\OffsetB)(BO) | |||
| lxvp vs36, (32+\OffsetB)(BO) | |||
| .endm | |||
| @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 34, 32 | |||
| xvf32gerpp 1, 35, 32 | |||
| #else | |||
| xvf32gerpp 0, 35, 32 | |||
| xvf32gerpp 1, 34, 32 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 36, 33 | |||
| xvf32gerpp 1, 37, 33 | |||
| #else | |||
| xvf32gerpp 0, 37, 33 | |||
| xvf32gerpp 1, 36, 33 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxv vs32, DISP2(\Index, \OffsetA)(\AREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs33, vs32, vs38, 2 | |||
| xxpermdi vs32, vs32, vs38, 0 | |||
| #else | |||
| xxpermdi vs33, vs32, vs38, 0 | |||
| xxpermdi vs32, vs32, vs38, 2 | |||
| #endif | |||
| .endif | |||
| .if \IsLast==1 | |||
| .if \Complete==1 | |||
| @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| #else | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
| lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 2, 41, 35 | |||
| xvf32gerpp 3, 40, 35 | |||
| xvf32gerpp 0, 39, 35 | |||
| xvf32gerpp 1, 38, 35 | |||
| #else | |||
| xvf32gerpp 2, 41, 34 | |||
| xvf32gerpp 3, 40, 34 | |||
| xvf32gerpp 0, 39, 34 | |||
| xvf32gerpp 1, 38, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
| @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs26, vs26, vs7 | |||
| xvaddsp vs27, vs27, vs5 | |||
| xvaddsp vs28, vs28, vs11 | |||
| @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs4, vs12, 1 | |||
| xxpermdi vs26, vs6, vs14, 1 | |||
| xxpermdi vs29, vs8, vs0, 1 | |||
| xxpermdi vs28, vs10, vs2, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| xxpermdi vs27, vs12, vs4, 2 | |||
| @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs2, vs10, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 32(CO) | |||
| @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| #else | |||
| xvf32gerpp 0, 33, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 37, 35 | |||
| xvf32gerpp 1, 36, 35 | |||
| #else | |||
| xvf32gerpp 0, 37, 34 | |||
| xvf32gerpp 1, 36, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| xvaddsp vs26, vs26, vs11 | |||
| xvaddsp vs27, vs27, vs9 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs8, vs0, 1 | |||
| xxpermdi vs26, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| xxpermdi vs27, vs0, vs8, 2 | |||
| xxpermdi vs26, vs2, vs10, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 0(T1) | |||
| @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxperm vs8, vs9, save_permute_1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 0 | |||
| xxpermdi vs9, vs8, vs0, 3 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 0 | |||
| xxpermdi vs9, vs0, vs8, 3 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs1 | |||
| xvaddsp vs26, vs26, vs9 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs24, vs0, vs8, 0 | |||
| xxpermdi vs26, vs8, vs0, 3 | |||
| #else | |||
| xxpermdi vs24, vs8, vs0, 0 | |||
| xxpermdi vs26, vs0, vs8, 3 | |||
| #endif | |||
| #endif | |||
| stxv vs24, 0(CO) | |||
| stxv vs26, 0(T1) | |||
| @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs32, (0+\OffsetA)(AO) | |||
| lxvp vs36, (32+\OffsetA)(AO) | |||
| vspltisb v10, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs42, 2 | |||
| xxpermdi vs34, vs34, vs42, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs42, 0 | |||
| xxpermdi vs34, vs34, vs42, 2 | |||
| #endif | |||
| lxvp vs38, (64+\OffsetA)(AO) | |||
| lxvp vs40, (64+32+\OffsetA)(AO) | |||
| .endm | |||
| @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 3, 35, 40 | |||
| .if \Complete==0 | |||
| lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs42, 2 | |||
| xxpermdi vs34, vs34, vs42, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs42, 0 | |||
| xxpermdi vs34, vs34, vs42, 2 | |||
| #endif | |||
| lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) | |||
| .endif | |||
| .if \IsLast==1 | |||
| @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART2 vs34, vs42, vs4, vs5 | |||
| MULT_APLHA_PART2 vs35, vs43, vs6, vs7 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| xxperm vs2, vs3, save_permute_1 | |||
| xxperm vs4, vs5, save_permute_1 | |||
| xxperm vs6, vs7, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| xxperm vs2, vs3, vs28 | |||
| xxperm vs4, vs5, vs28 | |||
| xxperm vs6, vs7, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs2 | |||
| @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvp vs26, 32(CO) | |||
| #else | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| stxv vs2, 0(CO) | |||
| stxv vs0, 16(CO) | |||
| stxv vs6, 32(CO) | |||
| stxv vs4, 48(CO) | |||
| #else | |||
| stxv vs0, 0(CO) | |||
| stxv vs2, 16(CO) | |||
| stxv vs4, 32(CO) | |||
| stxv vs6, 48(CO) | |||
| #endif | |||
| #endif | |||
| addi CO, CO, 64 | |||
| .endm | |||
| @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxv vs34, (\OffsetB)(BO) | |||
| lxvp vs32, (0+\OffsetA)(AO) | |||
| vspltisb v6, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs38, 2 | |||
| xxpermdi vs34, vs34, vs38, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs38, 0 | |||
| xxpermdi vs34, vs34, vs38, 2 | |||
| #endif | |||
| lxvp vs36, (32+\OffsetA)(AO) | |||
| .endm | |||
| @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 1, 35, 36 | |||
| .if \Complete==0 | |||
| lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs38, 2 | |||
| xxpermdi vs34, vs34, vs38, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs38, 0 | |||
| xxpermdi vs34, vs34, vs38, 2 | |||
| #endif | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| .if \IsLast==1 | |||
| @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
| MULT_APLHA_PART2 vs33, vs41, vs2, vs3 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| xxperm vs2, vs3, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| xxperm vs2, vs3, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs2 | |||
| @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvp vs24, 0(CO) | |||
| #else | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| stxv vs2, 0(CO) | |||
| stxv vs0, 16(CO) | |||
| #else | |||
| stxv vs0, 0(CO) | |||
| stxv vs2, 16(CO) | |||
| #endif | |||
| #endif | |||
| addi CO, CO, 32 | |||
| .endm | |||
| @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART1 vs32, vs40, vs0, vs1 | |||
| MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs0 | |||
| @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART1 vs32, vs40, vs37, vs1 | |||
| MULT_APLHA_PART2 vs32, vs40, vs37, vs1 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs37, vs1, save_permute_1 | |||
| #else | |||
| xxperm vs37, vs1, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs36, vs36, vs37 | |||
| @@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
| { | |||
| __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||
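| /* permute mask that swaps the real and imaginary words of each complex element; the byte pattern depends on endianness */ | |||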
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "cswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "cswap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "cswap_microk_power8.c" | |||
| #include "cswap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dasum_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "dasum_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dasum_microk_power8.c" | |||
| #include "dasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
| @@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -0,0 +1,923 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
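| /* GCC 10 only provides the pair-assembly builtin under the __builtin_mma_ prefix; map the newer __builtin_vsx_ name onto it when it is missing */ | |||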
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
| #endif | |||
| #if !defined(B0) | |||
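| /* beta != 0 path: disassemble each MMA accumulator and update C as alpha*result + beta*C */ | |||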
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_2x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #else | |||
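| /* B0 variant: beta is implicitly zero, so C is simply overwritten with alpha*result */ | |||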
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_2x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #endif | |||
| #define INIT_8ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); \ | |||
| __builtin_mma_xxsetaccz(&acc4); \ | |||
| __builtin_mma_xxsetaccz(&acc5); \ | |||
| __builtin_mma_xxsetaccz(&acc6); \ | |||
| __builtin_mma_xxsetaccz(&acc7); | |||
| #define INIT_4ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); | |||
| #define INIT_2ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); | |||
| #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
| #if (defined(__GNUC__) && (__GNUC__ == 10)) | |||
| #if defined(_AIX) | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
| #else | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); | |||
| #endif | |||
| #else | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
| #endif | |||
| #define LOAD_A_1x8(K, M) \ | |||
| ra0 = vec_xl(0, A+((K)*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+((K)*lda)+M+2); \ | |||
| ra2 = vec_xl(0, A+((K)*lda)+M+4); \ | |||
| ra3 = vec_xl(0, A+((K)*lda)+M+6); | |||
| #define LOAD_A_1x4(K, M) \ | |||
| ra0 = vec_xl(0, A+((K)*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+((K)*lda)+M+2); | |||
| #define LOAD_A_1x2(K, M) \ | |||
| ra0 = vec_xl(0, A+((K)*lda)+M+0); | |||
| #define LOAD_A_1x1(K, M) \ | |||
| ra0 = vec_splats(A[((K)*lda)+M+0]); | |||
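| /* the LOAD_BTP_* macros gather B elements across n for one or two k values and transpose them (mergeh/mergel or vec_insert) into __vector_pair operands for the FP64 MMA updates */ | |||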
| #define LOAD_BTP_8x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
| rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| t1 = vec_mergeh(rb2, rb3); \ | |||
| LOAD_PAIR(pb0, t0, t1); \ | |||
| t0 = vec_mergel(rb0, rb1); \ | |||
| t1 = vec_mergel(rb2, rb3); \ | |||
| LOAD_PAIR(pb2, t0, t1); \ | |||
| rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ | |||
| rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ | |||
| rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ | |||
| rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb4, rb5); \ | |||
| t1 = vec_mergeh(rb6, rb7); \ | |||
| LOAD_PAIR(pb1, t0, t1); \ | |||
| t0 = vec_mergel(rb4, rb5); \ | |||
| t1 = vec_mergel(rb6, rb7); \ | |||
| LOAD_PAIR(pb3, t0, t1); | |||
| #define LOAD_BTP_8x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| rb1 = vec_xor(rb1, rb1); \ | |||
| rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
| rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
| LOAD_PAIR(pb0, rb0, rb1); \ | |||
| rb2 = vec_xor(rb2, rb2); \ | |||
| rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ | |||
| rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ | |||
| rb3 = vec_xor(rb3, rb3); \ | |||
| rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ | |||
| rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ | |||
| LOAD_PAIR(pb1, rb2, rb3); | |||
| #define LOAD_BTP_4x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
| rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| t1 = vec_mergeh(rb2, rb3); \ | |||
| LOAD_PAIR(pb0, t0, t1); \ | |||
| t0 = vec_mergel(rb0, rb1); \ | |||
| t1 = vec_mergel(rb2, rb3); \ | |||
| LOAD_PAIR(pb1, t0, t1); | |||
| #define LOAD_BTP_4x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| rb1 = vec_xor(rb1, rb1); \ | |||
| rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
| rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
| LOAD_PAIR(pb0, rb0, rb1); | |||
| #define LOAD_BTP_2x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ | |||
| t1 = vec_mergel(rb0, rb1); \ | |||
| __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); | |||
| #define LOAD_BTP_2x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); | |||
| #define LOAD_B_1x1(N, K) \ | |||
| rb0 = vec_splats(B[((N)*ldb)+K]); | |||
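| /* each xvf64gerpp accumulates a 4x2 outer product into a __vector_quad: four B values from a vector pair (n direction) times two A values from a VSR (m direction) */ | |||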
| #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
| a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
| __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
| __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
| __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
| __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
| #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
| #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
| #define KERNEL_MMA_1ACC(b0, a0) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
| #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); \ | |||
| result2 = vec_madd(a2, b2, result2); \ | |||
| result3 = vec_madd(a3, b3, result3); | |||
| #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); | |||
| #define KERNEL_VMADD_1VSR(a0, b0) \ | |||
| result = vec_madd(a0, b0, result); | |||
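| /* spill / reload the transposed B vector pairs through the packB scratch buffer (8 doubles per k index) */ | |||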
| #define PACK_B(pb0, pb1, offset) \ | |||
| *((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ | |||
| *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; | |||
| #define LOAD_PACKED_B(pb0, pb1, offset) \ | |||
| pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ | |||
| pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); | |||
| #ifdef B0 | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG m, n, k; | |||
| BLASLONG m8 = M & ~7; | |||
| BLASLONG m4 = M & ~3; | |||
| BLASLONG m2 = M & ~1; | |||
| BLASLONG n8 = N & ~7; | |||
| BLASLONG n4 = N & ~3; | |||
| BLASLONG n2 = N & ~1; | |||
| BLASLONG k2 = K & ~1; | |||
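| /* pack the transposed B panel during the first m pass and reuse it for the remaining m blocks; only worthwhile for sufficiently large problems, and only enabled when building with GCC (not clang) */ | |||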
| #if defined(__GNUC__) && !defined(__clang__) | |||
| int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; | |||
| #else | |||
| int has_packing = 0; | |||
| #endif | |||
| double *packB; | |||
| if (has_packing) packB = (double *)malloc(K*8*sizeof(double)); | |||
| vector double valpha = vec_splats(alpha); | |||
| #if !defined(B0) | |||
| vector double vbeta = vec_splats(beta); | |||
| #endif | |||
| for (n = 0; n < n8; n += 8) { | |||
| for (m = 0; m < m8; m += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| if (has_packing) { | |||
| if (m == 0) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| PACK_B(pb0, pb1, 0); | |||
| LOAD_A_1x8(k+1, m); | |||
| KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| PACK_B(pb2, pb3, 8); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| PACK_B(pb0, pb1, 0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| LOAD_A_1x8(k+1, m); | |||
| LOAD_PACKED_B(pb2, pb3, 8); | |||
| KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| LOAD_A_1x8(k+1, m); | |||
| KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
| SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| if (!has_packing) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| LOAD_A_1x4(k+1, m); | |||
| KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| LOAD_A_1x4(k+1, m); | |||
| LOAD_PACKED_B(pb2, pb3, 8); | |||
| KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| if (!has_packing) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_A_1x2(k+1, m); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_A_1x2(k+1, m); | |||
| LOAD_PACKED_B(pb2, pb3, 8); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; m < M; m++) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| if (!has_packing) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| LOAD_PACKED_B(pb2, pb3, 8); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_PACKED_B(pb0, pb1, 0); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x1_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x1_ACC(&acc1, n+4, m+0); | |||
| } | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| for (m = 0; m < m8; m += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| LOAD_A_1x8(k+1, m); | |||
| KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| LOAD_A_1x4(k+1, m); | |||
| KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x2(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; m < M; m++) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x1_ACC(&acc0, n, m); | |||
| } | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| for (m = 0; m < m8; m += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| LOAD_A_1x8(k+1, m); | |||
| KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| LOAD_A_1x4(k+1, m); | |||
| KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x2(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| } | |||
| for (; m < M; m++) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x1_ACC(&acc0, n+0, m+0); | |||
| } | |||
| } | |||
| for (; n < N; n++) { | |||
| for (m = 0; m < m8; m += 8) { | |||
| vector double result = ((vector double){0.,0.}); | |||
| vector double result1 = ((vector double){0.,0.}); | |||
| vector double result2 = ((vector double){0.,0.}); | |||
| vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| SAVE_1x4_VSR(result2, n, m+4); | |||
| SAVE_1x4_VSR(result3, n, m+6); | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| vector double result = ((vector double){0.,0.}); | |||
| vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| } | |||
| for (; m < M; m++) { | |||
| FLOAT result = 0.0; | |||
| for (k = 0; k < K; k++) { | |||
| result += A[m+k*lda] * B[n*ldb+k]; | |||
| } | |||
| result = result * alpha; | |||
| #if !defined(B0) | |||
| C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
| #else | |||
| C[n*ldc+m] = result; | |||
| #endif | |||
| } | |||
| } | |||
| if (has_packing) free(packB); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,581 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
| #endif | |||
| #if !defined(B0) | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
| C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
| #else | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = result[0]; \ | |||
| C[(N+1)*ldc+M] = result[1]; | |||
| #endif | |||
| #define INIT_8ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); \ | |||
| __builtin_mma_xxsetaccz(&acc4); \ | |||
| __builtin_mma_xxsetaccz(&acc5); \ | |||
| __builtin_mma_xxsetaccz(&acc6); \ | |||
| __builtin_mma_xxsetaccz(&acc7); | |||
| #define INIT_4ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); | |||
| #define INIT_2ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); | |||
| #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
| #define LOAD_A_1x8(K, M) \ | |||
| ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+(K*lda)+M+2); \ | |||
| ra2 = vec_xl(0, A+(K*lda)+M+4); \ | |||
| ra3 = vec_xl(0, A+(K*lda)+M+6); | |||
| #define LOAD_A_1x4(K, M) \ | |||
| ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+(K*lda)+M+2); | |||
| #define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); | |||
| #define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); | |||
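| /* The LOAD_BP_ macros produce the __vector_pair operand expected by xvf64gerpp: 1x8 and 1x4 | |||
|    reinterpret 4 consecutive doubles of B in place, while 1x2 duplicates a 2-double vector into | |||
|    both halves of the pair. The LOAD_B_ macros load ordinary 2-double vectors (LOAD_B_1x1 splats | |||
|    a single element) for the vec_madd paths. */ | |||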
| #define LOAD_BP_1x8(K, N) \ | |||
| pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ | |||
| pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); | |||
| #define LOAD_BP_1x4(K, N) \ | |||
| pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); | |||
| #define LOAD_BP_1x2(K, N) \ | |||
| t0 = vec_xl(0, B+(K*ldb)+N); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); | |||
| #define LOAD_B_1x8(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+2); \ | |||
| rb2 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
| rb3 = vec_xl(0, B+(K*ldb)+N+6); | |||
| #define LOAD_B_1x4(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+2); | |||
| #define LOAD_B_1x2(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); | |||
| #define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); | |||
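| /* Each xvf64gerpp accumulates the outer product of a 4-double __vector_pair (taken from B) with | |||
|    a 2-double vector (taken from A) into a 4x2 double accumulator, i.e. one 4-column by 2-row | |||
|    tile of C per accumulator. */ | |||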
| #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
| a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
| __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
| __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
| __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
| __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
| #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
| #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
| #define KERNEL_MMA_1ACC(b0, a0) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
| #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); \ | |||
| result2 = vec_madd(a2, b2, result2); \ | |||
| result3 = vec_madd(a3, b3, result3); | |||
| #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); | |||
| #define KERNEL_VMADD_1VSR(a0, b0) \ | |||
| result = vec_madd(a0, b0, result); | |||
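| /* Kernel layout: M is blocked by 8/4/2/1 rows and, inside each M block, N by 8/4/2/1 columns. | |||
|    The wide blocks accumulate with __builtin_mma_xvf64gerpp into __vector_quad accumulators; the | |||
|    single-row and single-column remainders fall back to vec_madd vectors and finally a scalar | |||
|    loop. Every path computes C[n*ldc+m] = alpha * (sum over k of A[k*lda+m]*B[k*ldb+n]), plus | |||
|    beta*C[n*ldc+m] unless B0 is defined. */ | |||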
| #ifdef B0 | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG m, n, k; | |||
| BLASLONG m8 = M & ~7; | |||
| BLASLONG m4 = M & ~3; | |||
| BLASLONG m2 = M & ~1; | |||
| BLASLONG n8 = N & ~7; | |||
| BLASLONG n4 = N & ~3; | |||
| BLASLONG n2 = N & ~1; | |||
| vector double valpha = vec_splats(alpha); | |||
| #if !defined(B0) | |||
| vector double vbeta = vec_splats(beta); | |||
| #endif | |||
| for (m = 0; m < m8; m += 8) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
| SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double t0; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double result2 = ((vector double){0.,0.}); | |||
| register vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| SAVE_1x4_VSR(result2, n, m+4); | |||
| SAVE_1x4_VSR(result3, n, m+6); | |||
| } | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double t0; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| } | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double t0; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| } | |||
| } | |||
| for (; m < M; m++) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double result2 = ((vector double){0.,0.}); | |||
| register vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| SAVE_4x1_VSR(result1, n+2, m); | |||
| SAVE_4x1_VSR(result2, n+4, m); | |||
| SAVE_4x1_VSR(result3, n+6, m); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| SAVE_4x1_VSR(result1, n+2, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| FLOAT result = 0.0; | |||
| for (k = 0; k < K; k++) { | |||
| result += A[k*lda+m] * B[k*ldb+n]; | |||
| } | |||
| result = result * alpha; | |||
| #if !defined(B0) | |||
| C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
| #else | |||
| C[n*ldc+m] = result; | |||
| #endif | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,882 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
| #endif | |||
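| /* C write-back macros for this kernel; the 4x1 and 2x1 accumulator stores use vec_xl_len and | |||
|    vec_xst_len with an 8-byte length so that only a single double of C is read and written per | |||
|    column when just one row of the tile remains. */ | |||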
| #if !defined(B0) | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_2x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #else | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_2x1_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #endif | |||
| #define INIT_8ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); \ | |||
| __builtin_mma_xxsetaccz(&acc4); \ | |||
| __builtin_mma_xxsetaccz(&acc5); \ | |||
| __builtin_mma_xxsetaccz(&acc6); \ | |||
| __builtin_mma_xxsetaccz(&acc7); | |||
| #define INIT_4ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); | |||
| #define INIT_2ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); | |||
| #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
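| /* LOAD_PAIR assembles a __vector_pair from two vectors. GCC 10 only provides | |||
|    __builtin_vsx_assemble_pair, whose operand order on Linux is reversed relative to AIX, so the | |||
|    macro hides that difference; newer compilers use __builtin_vsx_build_pair directly. */ | |||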
| #if (defined(__GNUC__) && (__GNUC__ == 10)) | |||
| #if defined(_AIX) | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
| #else | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); | |||
| #endif | |||
| #else | |||
| #define LOAD_PAIR(pair, v0, v1) \ | |||
| __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
| #endif | |||
| #define LOAD_AT_8x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergel(ra0, ra1); \ | |||
| ra0 = t0; \ | |||
| ra1 = t1; \ | |||
| ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
| ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra2, ra3); \ | |||
| t1 = vec_mergel(ra2, ra3); \ | |||
| ra2 = t0; \ | |||
| ra3 = t1; \ | |||
| ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ | |||
| ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra4, ra5); \ | |||
| t1 = vec_mergel(ra4, ra5); \ | |||
| ra4 = t0; \ | |||
| ra5 = t1; \ | |||
| ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ | |||
| ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra6, ra7); \ | |||
| t1 = vec_mergel(ra6, ra7); \ | |||
| ra6 = t0; \ | |||
| ra7 = t1; | |||
| #define LOAD_AT_8x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
| ra1 = vec_xor(ra1, ra1); \ | |||
| ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
| ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ | |||
| ra2 = vec_xor(ra2, ra2); \ | |||
| ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ | |||
| ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ | |||
| ra3 = vec_xor(ra3, ra3); \ | |||
| ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ | |||
| ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); | |||
| #define LOAD_AT_4x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
| ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergeh(ra2, ra3); \ | |||
| t2 = vec_mergel(ra0, ra1); \ | |||
| t3 = vec_mergel(ra2, ra3); \ | |||
| ra0 = t0; \ | |||
| ra1 = t2; \ | |||
| ra2 = t1; \ | |||
| ra3 = t3; | |||
| #define LOAD_AT_4x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
| ra1 = vec_xor(ra1, ra1); \ | |||
| ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
| ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); | |||
| #define LOAD_AT_2x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergel(ra0, ra1); \ | |||
| ra0 = t0; \ | |||
| ra1 = t1; | |||
| #define LOAD_AT_2x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); | |||
| #define LOAD_A_1x1(K, M) \ | |||
| ra0 = vec_splats(A[((M+0)*lda)+K+0]); | |||
| #define LOAD_BTP_8x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
| rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| t1 = vec_mergeh(rb2, rb3); \ | |||
| LOAD_PAIR(pb0, t0, t1); \ | |||
| t0 = vec_mergel(rb0, rb1); \ | |||
| t1 = vec_mergel(rb2, rb3); \ | |||
| LOAD_PAIR(pb2, t0, t1); \ | |||
| rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ | |||
| rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ | |||
| rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ | |||
| rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb4, rb5); \ | |||
| t1 = vec_mergeh(rb6, rb7); \ | |||
| LOAD_PAIR(pb1, t0, t1); \ | |||
| t0 = vec_mergel(rb4, rb5); \ | |||
| t1 = vec_mergel(rb6, rb7); \ | |||
| LOAD_PAIR(pb3, t0, t1); | |||
| #define LOAD_BTP_8x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| rb1 = vec_xor(rb1, rb1); \ | |||
| rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
| rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
| LOAD_PAIR(pb0, rb0, rb1); \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ | |||
| rb1 = vec_xor(rb1, rb1); \ | |||
| rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ | |||
| rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ | |||
| LOAD_PAIR(pb1, rb0, rb1); | |||
| #define LOAD_BTP_4x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
| rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| t1 = vec_mergeh(rb2, rb3); \ | |||
| LOAD_PAIR(pb0, t0, t1); \ | |||
| t0 = vec_mergel(rb0, rb1); \ | |||
| t1 = vec_mergel(rb2, rb3); \ | |||
| LOAD_PAIR(pb1, t0, t1); | |||
| #define LOAD_BTP_4x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| rb1 = vec_xor(rb1, rb1); \ | |||
| rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
| rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
| LOAD_PAIR(pb0, rb0, rb1); | |||
| #define LOAD_BTP_2x2(N, K) \ | |||
| rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
| rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
| t0 = vec_mergeh(rb0, rb1); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ | |||
| t1 = vec_mergel(rb0, rb1); \ | |||
| __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); | |||
| #define LOAD_BTP_2x1(N, K) \ | |||
| rb0 = vec_xor(rb0, rb0); \ | |||
| rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
| rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); | |||
| #define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); | |||
| #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
| a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
| __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
| __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
| __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
| __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
| #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
| #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
| #define KERNEL_MMA_1ACC(b0, a0) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
| #define KERNEL_MMA_1ACC_(acc, b0, a0) \ | |||
| __builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); | |||
| #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); \ | |||
| result2 = vec_madd(a2, b2, result2); \ | |||
| result3 = vec_madd(a3, b3, result3); | |||
| #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); | |||
| #define KERNEL_VMADD_1VSR(a0, b0) \ | |||
| result = vec_madd(a0, b0, result); | |||
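| /* Here A and B are both read along their leading dimension (A[m*lda+k], B[n*ldb+k]), so the | |||
|    LOAD_AT_ and LOAD_BTP_ macros transpose 2-wide K slices on the fly with vec_mergeh and | |||
|    vec_mergel before feeding xvf64gerpp. Each path computes C[n*ldc+m] = alpha * (sum over k of | |||
|    A[m*lda+k]*B[n*ldb+k]), plus beta*C[n*ldc+m] unless B0 is defined. */ | |||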
| #ifdef B0 | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG m, n, k; | |||
| BLASLONG m8 = M & ~7; | |||
| BLASLONG m4 = M & ~3; | |||
| BLASLONG m2 = M & ~1; | |||
| BLASLONG n8 = N & ~7; | |||
| BLASLONG n4 = N & ~3; | |||
| BLASLONG n2 = N & ~1; | |||
| BLASLONG k2 = K & ~1; | |||
| vector double valpha = vec_splats(alpha); | |||
| #if !defined(B0) | |||
| vector double vbeta = vec_splats(beta); | |||
| #endif | |||
| for (m = 0; m < m8; m += 8) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, | |||
| ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); | |||
| KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, | |||
| ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); | |||
| } | |||
| // K remainder, processed one k at a time through single accumulators as a workaround to avoid register spilling | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_1ACC_(acc0, pb0, ra0); | |||
| KERNEL_MMA_1ACC_(acc1, pb0, ra1); | |||
| LOAD_AT_4x1(m+4, k); | |||
| KERNEL_MMA_1ACC_(acc2, pb0, ra0); | |||
| KERNEL_MMA_1ACC_(acc3, pb0, ra1); | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BTP_4x1(n+4, k); | |||
| KERNEL_MMA_1ACC_(acc4, pb0, ra0); | |||
| KERNEL_MMA_1ACC_(acc5, pb0, ra1); | |||
| LOAD_AT_4x1(m+4, k); | |||
| KERNEL_MMA_1ACC_(acc6, pb0, ra0); | |||
| KERNEL_MMA_1ACC_(acc7, pb0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc4, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc6, n+4, m+4); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
| SAVE_4x2_ACC(&acc5, n+4, m+2); | |||
| SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double result2 = ((vector double){0.,0.}); | |||
| register vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| SAVE_1x4_VSR(result2, n, m+4); | |||
| SAVE_1x4_VSR(result3, n, m+6); | |||
| } | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); | |||
| KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc2, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
| KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
| KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| } | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| KERNEL_MMA_1ACC(pb1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| KERNEL_MMA_1ACC(pb1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_B_1x1(n, k); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| } | |||
| } | |||
| for (; m < M; m++) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1, pb2, pb3; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_8x2(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_8x1(n, k); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x1_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x1_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_4x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_4x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x1_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_2x2(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_A_1x1(k+1, m); | |||
| KERNEL_MMA_1ACC(pb1, ra0); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_BTP_2x1(n, k); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x1_ACC(&acc0, n+0, m+0); | |||
| } | |||
| for (; n < N; n++) { | |||
| FLOAT result = 0.0; | |||
| for (k = 0; k < K; k++) { | |||
| result += A[m*lda+k] * B[n*ldb+k]; | |||
| } | |||
| result = result * alpha; | |||
| #if !defined(B0) | |||
| C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
| #else | |||
| C[n*ldc+m] = result; | |||
| #endif | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,829 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
| #endif | |||
| #if !defined(B0) | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
| C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
| #else | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_2x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = result[0]; \ | |||
| C[(N+1)*ldc+M] = result[1]; | |||
| #endif | |||
| #define INIT_8ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); \ | |||
| __builtin_mma_xxsetaccz(&acc4); \ | |||
| __builtin_mma_xxsetaccz(&acc5); \ | |||
| __builtin_mma_xxsetaccz(&acc6); \ | |||
| __builtin_mma_xxsetaccz(&acc7); | |||
| #define INIT_4ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); | |||
| #define INIT_2ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); | |||
| #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
| #define LOAD_AT_8x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
| ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergeh(ra2, ra3); \ | |||
| t2 = vec_mergel(ra0, ra1); \ | |||
| t3 = vec_mergel(ra2, ra3); \ | |||
| ra0 = t0; \ | |||
| ra1 = t2; \ | |||
| ra2 = t1; \ | |||
| ra3 = t3; \ | |||
| ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ | |||
| ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ | |||
| ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ | |||
| ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra4, ra5); \ | |||
| t1 = vec_mergeh(ra6, ra7); \ | |||
| t2 = vec_mergel(ra4, ra5); \ | |||
| t3 = vec_mergel(ra6, ra7); \ | |||
| ra4 = t0; \ | |||
| ra5 = t2; \ | |||
| ra6 = t1; \ | |||
| ra7 = t3; | |||
| #define LOAD_AT_8x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
| ra1 = vec_xor(ra1, ra1); \ | |||
| ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
| ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ | |||
| ra2 = vec_xor(ra2, ra2); \ | |||
| ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ | |||
| ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ | |||
| ra3 = vec_xor(ra3, ra3); \ | |||
| ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ | |||
| ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); | |||
| #define LOAD_AT_4x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
| ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergeh(ra2, ra3); \ | |||
| t2 = vec_mergel(ra0, ra1); \ | |||
| t3 = vec_mergel(ra2, ra3); \ | |||
| ra0 = t0; \ | |||
| ra1 = t2; \ | |||
| ra2 = t1; \ | |||
| ra3 = t3; | |||
| #define LOAD_AT_4x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
| ra1 = vec_xor(ra1, ra1); \ | |||
| ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
| ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); | |||
| #define LOAD_AT_2x2(M, K) \ | |||
| ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
| ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
| t0 = vec_mergeh(ra0, ra1); \ | |||
| t1 = vec_mergel(ra0, ra1); \ | |||
| ra0 = t0; \ | |||
| ra1 = t1; | |||
| #define LOAD_AT_2x1(M, K) \ | |||
| ra0 = vec_xor(ra0, ra0); \ | |||
| ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
| ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); | |||
| #define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); | |||
| #define LOAD_BP_1x8(K, N) \ | |||
| pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ | |||
| pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); | |||
| #define LOAD_BP_1x4(K, N) \ | |||
| pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); | |||
| #define LOAD_BP_1x2(K, N) \ | |||
| t0 = vec_xl(0, B+((K)*ldb)+N); \ | |||
| __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); | |||
| #define LOAD_B_1x8(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+2); \ | |||
| rb2 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
| rb3 = vec_xl(0, B+(K*ldb)+N+6); | |||
| #define LOAD_B_1x4(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+2); | |||
| #define LOAD_B_1x2(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); | |||
| #define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); | |||
| #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
| a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
| __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
| __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
| __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
| __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
| #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
| #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
| #define KERNEL_MMA_1ACC(b0, a0) \ | |||
| __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
| #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); \ | |||
| result2 = vec_madd(a2, b2, result2); \ | |||
| result3 = vec_madd(a3, b3, result3); | |||
| #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); | |||
| #define KERNEL_VMADD_1VSR(a0, b0) \ | |||
| result = vec_madd(a0, b0, result); | |||
| #define PACK_A(ra0, ra1, ra2, ra3, offset) \ | |||
| vec_xst(ra0, 0, packA+(k*8)+0+offset); \ | |||
| vec_xst(ra1, 0, packA+(k*8)+2+offset); \ | |||
| vec_xst(ra2, 0, packA+(k*8)+4+offset); \ | |||
| vec_xst(ra3, 0, packA+(k*8)+6+offset); | |||
| #define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ | |||
| ra0 = vec_xl(0, packA+(k*8)+0+offset); \ | |||
| ra1 = vec_xl(0, packA+(k*8)+2+offset); \ | |||
| ra2 = vec_xl(0, packA+(k*8)+4+offset); \ | |||
| ra3 = vec_xl(0, packA+(k*8)+6+offset); | |||
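| /* PACK_A and LOAD_PACKED_A spill the already-transposed 8-row panel of A into the contiguous | |||
|    packA scratch buffer while the first N block is computed, so subsequent N blocks can reload | |||
|    it with plain vec_xl instead of repeating the strided loads and merges. */ | |||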
| #ifdef B0 | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG m, n, k; | |||
| BLASLONG m8 = M & ~7; | |||
| BLASLONG m4 = M & ~3; | |||
| BLASLONG m2 = M & ~1; | |||
| BLASLONG n8 = N & ~7; | |||
| BLASLONG n4 = N & ~3; | |||
| BLASLONG n2 = N & ~1; | |||
| BLASLONG k2 = K & ~1; | |||
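| /* A is packed only when M, N and K are all at least 32 and the compiler is GCC (not clang); | |||
|    otherwise the kernel uses direct strided loads throughout. */ | |||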
| #if defined(__GNUC__) && !defined(__clang__) | |||
| int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; | |||
| #else | |||
| int has_packing = 0; | |||
| #endif | |||
| double *packA; | |||
| if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); | |||
| vector double valpha = vec_splats(alpha); | |||
| #if !defined(B0) | |||
| vector double vbeta = vec_splats(beta); | |||
| #endif | |||
| for (m = 0; m < m8; m += 8) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0, pb1; | |||
| if (has_packing) { | |||
| if (n == 0) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
| PACK_A(ra0, ra2, ra4, ra6, 0); | |||
| LOAD_BP_1x8(k+1, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
| PACK_A(ra1, ra3, ra5, ra7, 8); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| PACK_A(ra0, ra1, ra2, ra3, 0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
| LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
| LOAD_BP_1x8(k+1, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
| LOAD_BP_1x8(k+1, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
| SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| LOAD_BP_1x4(k+1, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
| LOAD_BP_1x4(k+1, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_8x2(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| LOAD_BP_1x2(k+1, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } else { | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
| LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
| LOAD_BP_1x2(k+1, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
| SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double result2 = ((vector double){0.,0.}); | |||
| register vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double rb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_8x1(m, k); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| SAVE_1x4_VSR(result2, n, m+4); | |||
| SAVE_1x4_VSR(result3, n, m+6); | |||
| } | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); | |||
| LOAD_BP_1x8(k+1, n); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
| LOAD_BP_1x4(k+1, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1, ra2, ra3; | |||
| register vector double t0, t1, t2, t3; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_4x2(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
| LOAD_BP_1x2(k+1, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0, ra1; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_4x1(m, k); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| SAVE_1x4_VSR(result1, n, m+2); | |||
| } | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector double ra0, ra1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0, pb1; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| LOAD_BP_1x8(k+1, n); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BP_1x8(k, n); | |||
| KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0, ra1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_BP_1x4(k+1, n); | |||
| KERNEL_MMA_1ACC(pb0, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BP_1x4(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_4x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector double ra0, ra1; | |||
| register vector double t0, t1; | |||
| __vector_pair pb0; | |||
| for (k = 0; k < k2; k += 2) { | |||
| LOAD_AT_2x2(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| LOAD_BP_1x2(k+1, n); | |||
| KERNEL_MMA_1ACC(pb0, ra1); | |||
| } | |||
| for (; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_BP_1x2(k, n); | |||
| KERNEL_MMA_1ACC(pb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| vector double result[4]; | |||
| SAVE_2x2_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_AT_2x1(m, k); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector double rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m+0); | |||
| } | |||
| } | |||
| for (; m < M; m++) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double result2 = ((vector double){0.,0.}); | |||
| register vector double result3 = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(m, k); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| SAVE_4x1_VSR(result1, n+2, m); | |||
| SAVE_4x1_VSR(result2, n+4, m); | |||
| SAVE_4x1_VSR(result3, n+6, m); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double result1 = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(m, k); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| SAVE_4x1_VSR(result1, n+2, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| register vector double result = ((vector double){0.,0.}); | |||
| register vector double ra0; | |||
| register vector double rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(m, k); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| SAVE_4x1_VSR(result, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| FLOAT result = 0.0; | |||
| for (k = 0; k < K; k++) { | |||
| result += A[m*lda+k] * B[k*ldb+n]; | |||
| } | |||
| result = result * alpha; | |||
| #if !defined(B0) | |||
| C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
| #else | |||
| C[n*ldc+m] = result; | |||
| #endif | |||
| } | |||
| } | |||
| if(has_packing) free(packA); | |||
| return 0; | |||
| } | |||
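A minimal standalone sketch (not OpenBLAS code) of the packing-on-demand idea the kernel above uses: during the first n-panel the A tile is written into a contiguous buffer while it is being consumed, and every later n-panel reloads it from that buffer instead of re-reading the strided source. Function and variable names are illustrative, and alpha/beta scaling and vectorization are omitted.

#include <stdlib.h>

/* Illustrative only: scalar model of on-demand packing for C += A*B, with A
   accessed transposed (A[m*lda+k]) as in the kernel's scalar tail loop. */
static void gemm_pack_on_demand(int M, int N, int K,
                                const double *A, int lda,
                                const double *B, int ldb,
                                double *C, int ldc)
{
    double *packA = (double *)malloc((size_t)K * (size_t)M * sizeof(double));
    if (!packA) return;
    for (int n = 0; n < N; n++) {
        for (int k = 0; k < K; k++) {
            if (n == 0)                                 /* first panel: pack while computing */
                for (int m = 0; m < M; m++)
                    packA[k * M + m] = A[m * lda + k];
            for (int m = 0; m < M; m++)                 /* all panels: contiguous reload */
                C[n * ldc + m] += packA[k * M + m] * B[k * ldb + n];
        }
    }
    free(packA);
}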
| @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y | |||
| XXSPLTD_S(32,%x9,0) // alpha, alpha | |||
| "sldi %6, %13, 3 \n\t" // lda * sizeof (double) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha | |||
| "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha | |||
| #else | |||
| "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha | |||
| "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha | |||
| #endif | |||
| "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda | |||
| "add %6, %6, %6 \n\t" // 2 * lda | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
| #else | |||
| XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha | |||
| #endif | |||
| "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda | |||
| "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda | |||
| @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda | |||
| "add %10, %10, %10 \n\t" // 2 * lda | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
| XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha | |||
| XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha | |||
| XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha | |||
| XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha | |||
| #else | |||
| XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
| @@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha | |||
| XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha | |||
| XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha | |||
| #endif | |||
| "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda | |||
| "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda | |||
| @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "one%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 40, 32 \n\t" | |||
| "xvmaddadp 37, 41, 32 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| #endif | |||
| "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 42, 33 \n\t" | |||
| "xvmaddadp 37, 43, 33 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| "xvmaddadp 37, 43, 35 \n\t" | |||
| #endif | |||
| "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 44, 34 \n\t" | |||
| "xvmaddadp 37, 45, 34 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 44, 32 \n\t" | |||
| "xvmaddadp 37, 45, 32 \n\t" | |||
| #endif | |||
| "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 46, 35 \n\t" | |||
| "xvmaddadp 37, 47, 35 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 46, 33 \n\t" | |||
| "xvmaddadp 37, 47, 33 \n\t" | |||
| #endif | |||
| "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 50, 38 \n\t" | |||
| "xvmaddadp 37, 51, 38 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 50, 48 \n\t" | |||
| "xvmaddadp 37, 51, 48 \n\t" | |||
| #endif | |||
| "lxvpx 50, %7, %11 \n\t" // a4[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 52, 39 \n\t" | |||
| "xvmaddadp 37, 53, 39 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 52, 49 \n\t" | |||
| "xvmaddadp 37, 53, 49 \n\t" | |||
| #endif | |||
| "lxvpx 52, %8, %11 \n\t" // a5[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 54, 48 \n\t" | |||
| "xvmaddadp 37, 55, 48 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 54, 38 \n\t" | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| #endif | |||
| "lxvpx 54, %9, %11 \n\t" // a6[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 56, 49 \n\t" | |||
| "xvmaddadp 37, 57, 49 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| #endif | |||
| "lxvpx 56, %10, %11 \n\t" // a7[0] | |||
| "addi %11, %11, 32 \n\t" | |||
| @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "two%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 40, 32 \n\t" | |||
| "xvmaddadp 37, 41, 32 \n\t" | |||
| "xvmaddadp 36, 42, 33 \n\t" | |||
| "xvmaddadp 37, 43, 33 \n\t" | |||
| "xvmaddadp 36, 44, 34 \n\t" | |||
| "xvmaddadp 37, 45, 34 \n\t" | |||
| "xvmaddadp 36, 46, 35 \n\t" | |||
| "xvmaddadp 37, 47, 35 \n\t" | |||
| "xvmaddadp 36, 50, 38 \n\t" | |||
| "xvmaddadp 37, 51, 38 \n\t" | |||
| "xvmaddadp 36, 52, 39 \n\t" | |||
| "xvmaddadp 37, 53, 39 \n\t" | |||
| "xvmaddadp 36, 54, 48 \n\t" | |||
| "xvmaddadp 37, 55, 48 \n\t" | |||
| "xvmaddadp 36, 56, 49 \n\t" | |||
| "xvmaddadp 37, 57, 49 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| #endif | |||
| "stxvp 36, 0( %2) \n\t" // y0, y1 | |||
| : | |||
| @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||
| "lxvp 40, 32(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(42,34,35) | |||
| XXMRGLD_S(43,34,35) | |||
| XXMRGHD_S(44,4,5) | |||
| XXMRGLD_S(45,4,5) | |||
| #else | |||
| XXMRGLD_S(42,35,34) | |||
| XXMRGHD_S(43,35,34) | |||
| XXMRGLD_S(44,5,4) | |||
| XXMRGHD_S(45,5,4) | |||
| #endif | |||
| "xvadddp 42,42,43 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(46,6,7) | |||
| XXMRGLD_S(47,6,7) | |||
| #else | |||
| XXMRGLD_S(46,7,6) | |||
| XXMRGHD_S(47,7,6) | |||
| #endif | |||
| "xvadddp 44,44,45 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(48,8,9) | |||
| XXMRGLD_S(49,8,9) | |||
| #else | |||
| XXMRGLD_S(48,9,8) | |||
| XXMRGHD_S(49,9,8) | |||
| #endif | |||
| "xvadddp 46,46,47 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 38,42,36 \n\t" | |||
| "xvmaddadp 39,44,36 \n\t" | |||
| #else | |||
| "xvmaddadp 39,42,36 \n\t" | |||
| "xvmaddadp 38,44,36 \n\t" | |||
| #endif | |||
| "xvadddp 48,48,49 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 41,48,36 \n\t" | |||
| #else | |||
| "xvmaddadp 41,46,36 \n\t" | |||
| #endif | |||
| "stxvp 38, 0(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 40,46,36 \n\t" | |||
| #else | |||
| "xvmaddadp 40,48,36 \n\t" | |||
| #endif | |||
| "stxvp 40, 32(%[y]) \n\t" | |||
| : [memy] "+m" (*(double (*)[8])y), | |||
| @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "drot_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "drot_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "drot_microk_power8.c" | |||
| #include "drot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dscal_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "dscal_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dscal_microk_power8.c" | |||
| #include "dscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "swap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dswap_microk_power8.c" | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -0,0 +1,84 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) | |||
| { | |||
| double MNK = (double) M * (double) N * (double) K; | |||
| #if defined(DOUBLE) // dgemm | |||
| // gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This | |||
| // issue affects both dgemm_nn and dgemm_tn. | |||
| #if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)) | |||
| if (!transb) | |||
| return 0; | |||
| #endif | |||
| if (MNK <= 54.0*54.0*54.0) | |||
| return 1; | |||
| #else // sgemm | |||
| #if defined(__GNUC__) && defined(__clang__) | |||
| // clang generates code with register spilling in the packing region, so | |||
| // on-demand packing is disabled for clang. Since that packing is one of the | |||
| // reasons the small kernels outperform the normal flow as MNK grows, the | |||
| // MNK threshold is reduced for code generated by clang. | |||
| if (MNK > 84.0*84.0*84.0) | |||
| return 0; | |||
| if (transa && !transb) { | |||
| // sgemm_tn works better when packing on-demand is used | |||
| if (MNK <= 64.0*64.0*64.0 && K >= 4) | |||
| return 1; | |||
| else | |||
| return 0; | |||
| } | |||
| #else // gcc | |||
| if (MNK > 100.0*100.0*100.0) | |||
| return 0; | |||
| #endif | |||
| // Multi-threaded execution outperforms (or at least approaches) the small | |||
| // kernel, so only permit it for small problems when several CPUs are available. | |||
| if (num_cpu_avail(3) > 1) { | |||
| if (MNK <= 64.0*64.0*64.0) | |||
| return 1; | |||
| } else { | |||
| return 1; | |||
| } | |||
| #endif | |||
| return 0; | |||
| } | |||
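For illustration only, a self-contained restatement of the DGEMM branch of the heuristic above (the gcc path, without the gcc 11 workaround), showing how the MNK volume threshold drives the yes/no decision; the function name and standalone form are assumptions for this sketch, not the interface OpenBLAS actually uses.

#include <stdio.h>

/* Hypothetical helper mirroring the DGEMM threshold above: permit the small
   kernel only when the problem volume is at most 54*54*54. */
static int dgemm_small_permit_sketch(long M, long N, long K)
{
    double MNK = (double)M * (double)N * (double)K;
    return MNK <= 54.0 * 54.0 * 54.0;
}

int main(void)
{
    printf("48x48x48 -> %d\n", dgemm_small_permit_sketch(48, 48, 48)); /* 1: small kernel permitted */
    printf("64x64x64 -> %d\n", dgemm_small_permit_sketch(64, 64, 64)); /* 0: fall back to the normal flow */
    return 0;
}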
| @@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sasum_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "sasum_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sasum_microk_power8.c" | |||
| #include "sasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -0,0 +1,887 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
| #if !defined(B0) | |||
| #define SAVE_4x4_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[2] = vec_madd(result[2], valpha, rc0); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[3] = vec_madd(result[3], valpha, rc0); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x4_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[0] = vec_madd(result[0], valpha, rc0); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result[1] = vec_madd(result[1], valpha, rc0); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_2x2_VSR(result, N, M) \ | |||
| rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ | |||
| rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ | |||
| rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst_len(result, C+(N*ldc)+M, 8); \ | |||
| C[(N+1)*ldc+M+0] = result[2]; \ | |||
| C[(N+1)*ldc+M+1] = result[3]; | |||
| #define SAVE_1x2_VSR(result, N, M) \ | |||
| rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ | |||
| rc0 = vec_mul(rc0, vbeta); \ | |||
| result = vec_madd(result, valpha, rc0); \ | |||
| vec_xst_len(result, C+(N*ldc)+M, 8); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
| C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ | |||
| C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ | |||
| C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; | |||
| #define SAVE_2x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
| C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
| #else | |||
| #define SAVE_4x4_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
| #define SAVE_4x2_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
| result[2] = vec_mul(result[2], valpha); \ | |||
| vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
| result[3] = vec_mul(result[3], valpha); \ | |||
| vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
| #define SAVE_2x4_ACC(ACC, N, M) \ | |||
| __builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
| result[0] = vec_mul(result[0], valpha); \ | |||
| vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
| result[1] = vec_mul(result[1], valpha); \ | |||
| vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
| #define SAVE_1x4_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst(result, 0, C+((N)*ldc)+M); | |||
| #define SAVE_2x2_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst_len(result, C+(N*ldc)+M, 8); \ | |||
| C[(N+1)*ldc+M+0] = result[2]; \ | |||
| C[(N+1)*ldc+M+1] = result[3]; | |||
| #define SAVE_1x2_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| vec_xst_len(result, C+(N*ldc)+M, 8); | |||
| #define SAVE_4x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = result[0]; \ | |||
| C[(N+1)*ldc+M] = result[1]; \ | |||
| C[(N+2)*ldc+M] = result[2]; \ | |||
| C[(N+3)*ldc+M] = result[3]; | |||
| #define SAVE_2x1_VSR(result, N, M) \ | |||
| result = vec_mul(result, valpha); \ | |||
| C[(N+0)*ldc+M] = result[0]; \ | |||
| C[(N+1)*ldc+M] = result[1]; | |||
| #endif | |||
| #define INIT_8ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); \ | |||
| __builtin_mma_xxsetaccz(&acc4); \ | |||
| __builtin_mma_xxsetaccz(&acc5); \ | |||
| __builtin_mma_xxsetaccz(&acc6); \ | |||
| __builtin_mma_xxsetaccz(&acc7); | |||
| #define INIT_4ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); \ | |||
| __builtin_mma_xxsetaccz(&acc2); \ | |||
| __builtin_mma_xxsetaccz(&acc3); | |||
| #define INIT_2ACCS() \ | |||
| __builtin_mma_xxsetaccz(&acc0); \ | |||
| __builtin_mma_xxsetaccz(&acc1); | |||
| #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
| #define LOAD_A_1x16(K, M) \ | |||
| ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+(K*lda)+M+4); \ | |||
| ra2 = vec_xl(0, A+(K*lda)+M+8); \ | |||
| ra3 = vec_xl(0, A+(K*lda)+M+12); | |||
| #define LOAD_A_1x8(K, M) \ | |||
| ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
| ra1 = vec_xl(0, A+(K*lda)+M+4); | |||
| #define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); | |||
| #define LOAD_A_2x2(K, M) \ | |||
| ra0 = vec_splats(A[K*lda+M+0]); \ | |||
| ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ | |||
| ra0 = vec_insert(A[K*lda+M+1], ra0, 3); | |||
| #define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); | |||
| #define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); | |||
| #define LOAD_B_1x16(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
| rb2 = vec_xl(0, B+(K*ldb)+N+8); \ | |||
| rb3 = vec_xl(0, B+(K*ldb)+N+12); | |||
| #define LOAD_B_1x8(K, N) \ | |||
| rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
| rb1 = vec_xl(0, B+(K*ldb)+N+4); | |||
| #define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); | |||
| #define LOAD_B_2x2(K, N) \ | |||
| rb0 = vec_splats(B[K*ldb+N]); \ | |||
| rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ | |||
| rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); | |||
| #define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); | |||
| #define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); | |||
| #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
| a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
| __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ | |||
| __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ | |||
| __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ | |||
| __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ | |||
| __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); | |||
| #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
| __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ | |||
| __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ | |||
| __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); | |||
| #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
| __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
| __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); | |||
| #define KERNEL_MMA_1ACC(b0, a0) \ | |||
| __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); | |||
| #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); \ | |||
| result2 = vec_madd(a2, b2, result2); \ | |||
| result3 = vec_madd(a3, b3, result3); | |||
| #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
| result = vec_madd(a0, b0, result); \ | |||
| result1 = vec_madd(a1, b1, result1); | |||
| #define KERNEL_VMADD_1VSR(a0, b0) \ | |||
| result = vec_madd(a0, b0, result); | |||
| #define PACK_A(ra0, ra1, ra2, ra3, offset) \ | |||
| vec_xst(ra0, 0, packA+(k*16)+0+offset); \ | |||
| vec_xst(ra1, 0, packA+(k*16)+4+offset); \ | |||
| vec_xst(ra2, 0, packA+(k*16)+8+offset); \ | |||
| vec_xst(ra3, 0, packA+(k*16)+12+offset); | |||
| #define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ | |||
| ra0 = vec_xl(0, packA+(k*16)+0+offset); \ | |||
| ra1 = vec_xl(0, packA+(k*16)+4+offset); \ | |||
| ra2 = vec_xl(0, packA+(k*16)+8+offset); \ | |||
| ra3 = vec_xl(0, packA+(k*16)+12+offset); | |||
| #ifdef B0 | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
| #else | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG m, n, k; | |||
| BLASLONG m16 = M & ~15; | |||
| BLASLONG m8 = M & ~7; | |||
| BLASLONG m4 = M & ~3; | |||
| BLASLONG m2 = M & ~1; | |||
| BLASLONG n16 = N & ~15; | |||
| BLASLONG n8 = N & ~7; | |||
| BLASLONG n4 = N & ~3; | |||
| BLASLONG n2 = N & ~1; | |||
| vector float valpha = vec_splats(alpha); | |||
| #if !defined(B0) | |||
| vector float vbeta = vec_splats(beta); | |||
| #endif | |||
| #if defined(__GNUC__) && !defined(__clang__) | |||
| int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0; | |||
| #else | |||
| int has_packing = 0; | |||
| #endif | |||
| float *packA; | |||
| if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); | |||
| for (m = 0; m < m16; m += 16) { | |||
| for (n = 0; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector float ra0, ra1, ra2, ra3; | |||
| register vector float rb0, rb1; | |||
| if (has_packing) { | |||
| if (n == 0) { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x16(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| PACK_A(ra0, ra1, ra2, ra3, 0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x16(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
| ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x4_ACC(&acc4, n+0, m+8); | |||
| SAVE_4x4_ACC(&acc6, n+0, m+12); | |||
| SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x4_ACC(&acc3, n+4, m+4); | |||
| SAVE_4x4_ACC(&acc5, n+4, m+8); | |||
| SAVE_4x4_ACC(&acc7, n+4, m+12); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector float ra0, ra1, ra2, ra3; | |||
| register vector float rb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x16(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc1, n+0, m+4); | |||
| SAVE_4x4_ACC(&acc2, n+0, m+8); | |||
| SAVE_4x4_ACC(&acc3, n+0, m+12); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector float ra0, ra1, ra2, ra3; | |||
| register vector float rb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x16(k, m); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_2x4_ACC(&acc0, n, m+0); | |||
| SAVE_2x4_ACC(&acc1, n, m+4); | |||
| SAVE_2x4_ACC(&acc2, n, m+8); | |||
| SAVE_2x4_ACC(&acc3, n, m+12); | |||
| } | |||
| for (; n < N; n++) { | |||
| vector float result = ((vector float){0., 0., 0., 0.}); | |||
| vector float result1 = ((vector float){0., 0., 0., 0.}); | |||
| vector float result2 = ((vector float){0., 0., 0., 0.}); | |||
| vector float result3 = ((vector float){0., 0., 0., 0.}); | |||
| register vector float ra0, ra1, ra2, ra3; | |||
| register vector float rb0; | |||
| if (!has_packing) { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x16(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| } else { | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
| } | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m); | |||
| SAVE_1x4_VSR(result1, n, m+4); | |||
| SAVE_1x4_VSR(result2, n, m+8); | |||
| SAVE_1x4_VSR(result3, n, m+12); | |||
| } | |||
| } | |||
| for (; m < m8; m += 8) { | |||
| for (n = 0; n < n16; n += 16) { | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| INIT_8ACCS(); | |||
| register vector float ra0, ra1; | |||
| register vector float rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x16(k, n); | |||
| KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, | |||
| ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc4, n+0, m+4); | |||
| SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x4_ACC(&acc5, n+4, m+4); | |||
| SAVE_4x4_ACC(&acc2, n+8, m+0); | |||
| SAVE_4x4_ACC(&acc6, n+8, m+4); | |||
| SAVE_4x4_ACC(&acc3, n+12, m+0); | |||
| SAVE_4x4_ACC(&acc7, n+12, m+4); | |||
| } | |||
| for (; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector float ra0, ra1; | |||
| register vector float rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc2, n+0, m+4); | |||
| SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x4_ACC(&acc3, n+4, m+4); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector float ra0, ra1; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc1, n+0, m+4); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector float ra0, ra1; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_2x4_ACC(&acc0, n, m+0); | |||
| SAVE_2x4_ACC(&acc1, n, m+4); | |||
| } | |||
| for (; n < N; n++) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0, ra1; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x8(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m); | |||
| SAVE_1x4_VSR(result1, n, m+4); | |||
| } | |||
| } | |||
| for (; m < m4; m += 4) { | |||
| for (n = 0; n < n16; n += 16) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x16(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x4_ACC(&acc2, n+8, m+0); | |||
| SAVE_4x4_ACC(&acc3, n+12, m+0); | |||
| } | |||
| for (; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_MMA_1ACC(rb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_MMA_1ACC(rb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_2x4_ACC(&acc0, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x4(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| SAVE_1x4_VSR(result, n, m); | |||
| } | |||
| } | |||
| for (; m < m2; m += 2) { | |||
| for (n = 0; n < n16; n += 16) { | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| INIT_4ACCS(); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x16(k, n); | |||
| KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| SAVE_4x2_ACC(&acc2, n+8, m+0); | |||
| SAVE_4x2_ACC(&acc3, n+12, m+0); | |||
| } | |||
| for (; n < n8; n += 8) { | |||
| __vector_quad acc0, acc1; | |||
| INIT_2ACCS(); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| __vector_quad acc0; | |||
| INIT_1ACC(); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_MMA_1ACC(rb0, ra0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| vector float result[4]; | |||
| SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_2x2(k, m); | |||
| LOAD_B_2x2(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| SAVE_2x2_VSR(result, n, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x2(k, m); | |||
| LOAD_B_1x1(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| #if !defined(B0) | |||
| register vector float rc0; | |||
| #endif | |||
| SAVE_1x2_VSR(result, n, m); | |||
| } | |||
| } | |||
| for (; m < M; m++) { | |||
| for (n = 0; n < n16; n += 16) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
| vector float result2 = ((vector float){0.,0.,0.,0.}); | |||
| vector float result3 = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1, rb2, rb3; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x16(k, n); | |||
| KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
| } | |||
| SAVE_4x1_VSR(result, n+0, m); | |||
| SAVE_4x1_VSR(result1, n+4, m); | |||
| SAVE_4x1_VSR(result2, n+8, m); | |||
| SAVE_4x1_VSR(result3, n+12, m); | |||
| } | |||
| for (; n < n8; n += 8) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0, rb1; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x8(k, n); | |||
| KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
| } | |||
| SAVE_4x1_VSR(result, n+0, m); | |||
| SAVE_4x1_VSR(result1, n+4, m); | |||
| } | |||
| for (; n < n4; n += 4) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x4(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| SAVE_4x1_VSR(result, n+0, m); | |||
| } | |||
| for (; n < n2; n += 2) { | |||
| vector float result = ((vector float){0.,0.,0.,0.}); | |||
| register vector float ra0; | |||
| register vector float rb0; | |||
| for (k = 0; k < K; k++) { | |||
| LOAD_A_1x1(k, m); | |||
| LOAD_B_1x2(k, n); | |||
| KERNEL_VMADD_1VSR(ra0, rb0); | |||
| } | |||
| SAVE_2x1_VSR(result, n+0, m); | |||
| } | |||
| for (; n < N; n++) { | |||
| FLOAT result = 0.0f; | |||
| for (k = 0; k < K; k++) { | |||
| result += A[k*lda+m] * B[k*ldb+n]; | |||
| } | |||
| result = result * alpha; | |||
| #if !defined(B0) | |||
| C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
| #else | |||
| C[n*ldc+m] = result; | |||
| #endif | |||
| } | |||
| } | |||
| if (has_packing) free (packA); | |||
| return 0; | |||
| } | |||
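For orientation: every vector path above performs the same update that the scalar tail loop at the end of the function spells out — each C(m,n) accumulates the sum over k of A[k*lda+m]*B[k*ldb+n], is scaled by alpha, and (unless B0, i.e. the beta == 0 variant, is defined) is blended with beta*C[n*ldc+m]. A minimal scalar reference with the same indexing, offered only as a sketch (the name sgemm_small_ref and this free-standing signature are mine, not part of OpenBLAS):

/* Illustrative scalar reference for the small-matrix kernel above. */
static void sgemm_small_ref(int M, int N, int K, float alpha,
                            const float *A, int lda,
                            const float *B, int ldb,
                            float beta, float *C, int ldc)
{
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            float result = 0.0f;
            for (int k = 0; k < K; k++) {
                result += A[k*lda + m] * B[k*ldb + n];   /* same indexing as the tail loop */
            }
            result *= alpha;
            C[n*ldc + m] = beta * C[n*ldc + m] + result; /* with B0 defined: C[n*ldc+m] = result */
        }
    }
}

The __vector_quad accumulators (acc0, acc1) in the wider paths are POWER10 MMA accumulators; the INIT_*ACC, KERNEL_MMA_* and SAVE_*_ACC macros used here presumably wrap the GCC/Clang builtins __builtin_mma_xxsetaccz, __builtin_mma_xvf32gerpp and __builtin_mma_disassemble_acc, which zero an accumulator, add one rank-1 (outer-product) update per k, and spill the accumulated 4x4 tile back into vector float registers for scaling and store.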
| @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "srot_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "srot_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "srot_microk_power8.c" | |||
| #include "srot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
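As I read the hunk above, the __BYTE_ORDER__ qualifier on the POWER10 branch is dropped, so big-endian POWER10 builds now select the POWER10 srot microkernel rather than falling back to the POWER8 one; the same pattern recurs below for sscal and sswap. After the change the include selection should reduce to the following (a reconstruction from the hunk, not a verbatim quote of the file):

#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10)
#include "srot_microk_power10.c"
#endif
#endif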
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
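The align expression above computes how many leading elements must be peeled off in scalar code before y reaches a 32-byte boundary for the vector loop: (32 - (addr & 0x1F)) is the byte distance to the next 32-byte boundary, >> 2 converts bytes to 4-byte floats, and & 0x7 folds the already-aligned case (distance 32, i.e. 8 floats) back to 0. A small self-contained check of that arithmetic (the helper name and test values are mine, purely illustrative):

#include <assert.h>
#include <stdint.h>

/* Number of leading floats to process before *p is 32-byte aligned
 * (same expression as in the kernel above). */
static long floats_to_32byte_boundary(const float *p)
{
    return ((32 - ((uintptr_t)p & (uintptr_t)0x1F)) >> 2) & 0x7;
}

int main(void)
{
    float buf[16] __attribute__((aligned(32)));
    assert(floats_to_32byte_boundary(buf)     == 0); /* already aligned                        */
    assert(floats_to_32byte_boundary(buf + 3) == 5); /* 12 bytes in, 20 bytes (5 floats) to go */
    assert(floats_to_32byte_boundary(buf + 7) == 1); /* 28 bytes in, 4 bytes (1 float) to go   */
    return 0;
}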
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sscal_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "sscal_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sscal_microk_power8.c" | |||
| #include "sscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "swap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sswap_microk_power8.c" | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 64 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, | |||
| double alpha_r, double alpha_i) | |||
| { | |||
| #if !defined(CONJ) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| static const double mvec[2] = { -1.0, 1.0 }; | |||
| #else | |||
| static const double mvec[2] = { 1.0, -1.0 }; | |||
| #endif | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| static const double mvec[2] = { 1.0, -1.0 }; | |||
| #else | |||
| static const double mvec[2] = { -1.0, 1.0 }; | |||
| #endif | |||
| #endif | |||
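/* Editorial annotation, not part of the original file: mvec supplies the
 * per-lane signs for the complex multiply in y += alpha*x.  In the non-CONJ
 * case the real part of each product is alpha_r*x_r - alpha_i*x_i and the
 * imaginary part is alpha_r*x_i + alpha_i*x_r, so exactly one of the four
 * terms carries a minus sign; the CONJ variant negates a different term,
 * hence the swapped {-1, 1} pattern.  The big- and little-endian constants
 * differ only in the order of the two signs, presumably because the lane
 * order of the {real, imaginary} pair inside a VSX register is reversed
 * between the two byte orders. */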
| const double *mvecp = mvec; | |||