| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) | |||||
| project(OpenBLAS) | project(OpenBLAS) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 2) | set(OpenBLAS_MINOR_VERSION 2) | ||||
| set(OpenBLAS_PATCH_VERSION 18) | |||||
| set(OpenBLAS_PATCH_VERSION 19) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| enable_language(ASM) | enable_language(ASM) | ||||
| @@ -45,8 +45,8 @@ endif() | |||||
| message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") | message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") | |||||
| include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | set(BLASDIRS interface driver/level2 driver/level3 driver/others) | ||||
| @@ -123,9 +123,9 @@ endforeach () | |||||
| # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. | # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. | ||||
| # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. | # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. | ||||
| if (NOT NOFORTRAN AND NOT NO_LAPACK) | if (NOT NOFORTRAN AND NOT NO_LAPACK) | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake") | |||||
| if (NOT NO_LAPACKE) | if (NOT NO_LAPACKE) | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -137,7 +137,7 @@ endif() | |||||
| # add objects to the openblas lib | # add objects to the openblas lib | ||||
| add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||||
| # Set output for libopenblas | # Set output for libopenblas | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | ||||
| @@ -150,3 +150,14 @@ In chronological order: | |||||
| * theoractice <https://github.com/theoractice/> | * theoractice <https://github.com/theoractice/> | ||||
| * [2016-03-20] Fix compiler error in VisualStudio with CMake | * [2016-03-20] Fix compiler error in VisualStudio with CMake | ||||
| * [2016-03-22] Fix access violation on Windows while static linking | * [2016-03-22] Fix access violation on Windows while static linking | ||||
| * Paul Mustière <https://github.com/buffer51/> | |||||
| * [2016-02-04] Fix Android build on ARMV7 | |||||
| * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 | |||||
| * Shivraj Patil <https://github.com/sva-img/> | |||||
| * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA | |||||
| * Kaustubh Raste <https://github.com/ksraste/> | |||||
| * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | |||||
| * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA | |||||
| @@ -1,4 +1,22 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.19 | |||||
| 1-Sep-2016 | |||||
| common: | |||||
| * Improved cross compiling. | |||||
| * Fix a bug on musl libc. | |||||
| POWER: | |||||
| * Optimize BLAS on Power8 | |||||
| * Fixed Julia+OpenBLAS bugs on Power8 | |||||
| MIPS: | |||||
| * Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste) | |||||
| ARM: | |||||
| * Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.2.18 | Version 0.2.18 | ||||
| 12-Apr-2016 | 12-Apr-2016 | ||||
| @@ -108,8 +108,6 @@ endif | |||||
| tests : | tests : | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| ifndef TARGET | |||||
| ifndef CROSS | |||||
| touch $(LIBNAME) | touch $(LIBNAME) | ||||
| ifndef NO_FBLAS | ifndef NO_FBLAS | ||||
| $(MAKE) -C test all | $(MAKE) -C test all | ||||
| @@ -119,8 +117,6 @@ ifndef NO_CBLAS | |||||
| $(MAKE) -C ctest all | $(MAKE) -C ctest all | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| endif | |||||
| libs : | libs : | ||||
| ifeq ($(CORE), UNKOWN) | ifeq ($(CORE), UNKOWN) | ||||
| @@ -20,75 +20,75 @@ lib.grd : | |||||
| $(error OpenBLAS: Please run "make" firstly) | $(error OpenBLAS: Please run "make" firstly) | ||||
| install : lib.grd | install : lib.grd | ||||
| @-mkdir -p $(DESTDIR)$(PREFIX) | |||||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||||
| @-mkdir -p "$(DESTDIR)$(PREFIX)" | |||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" | |||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | |||||
| @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| #for inc | #for inc | ||||
| @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||||
| @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||||
| @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||||
| @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||||
| @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||||
| @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||||
| @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||||
| @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||||
| @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||||
| @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||||
| @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||||
| ifndef NO_CBLAS | ifndef NO_CBLAS | ||||
| @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h | |||||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||||
| endif | endif | ||||
| ifndef NO_LAPACKE | ifndef NO_LAPACKE | ||||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||||
| endif | endif | ||||
| #for install static library | #for install static library | ||||
| ifndef NO_STATIC | ifndef NO_STATIC | ||||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
| @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | ||||
| endif | endif | ||||
| #for install shared library | #for install shared library | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | ||||
| @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | ifeq ($(OSNAME), FreeBSD) | ||||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), NetBSD) | ifeq ($(OSNAME), NetBSD) | ||||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) | |||||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | |||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||||
| @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) | @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) | ||||
| @@ -96,34 +96,34 @@ endif | |||||
| endif | endif | ||||
| #Generating OpenBLASConfig.cmake | #Generating OpenBLASConfig.cmake | ||||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | ||||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| #ifeq logical or | #ifeq logical or | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | ||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | ||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| endif | endif | ||||
| else | else | ||||
| #only static | #only static | ||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||||
| endif | endif | ||||
| #Generating OpenBLASConfigVersion.cmake | #Generating OpenBLASConfigVersion.cmake | ||||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | ||||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||||
| @echo Install OK! | @echo Install OK! | ||||
| @@ -0,0 +1,3 @@ | |||||
| ifdef BINARY64 | |||||
| else | |||||
| endif | |||||
| @@ -1,4 +1,26 @@ | |||||
| # CCOMMON_OPT += -DALLOC_SHM | |||||
| ifdef USE_THREAD | |||||
| ifeq ($(USE_THREAD), 0) | |||||
| USE_OPENMP = 0 | |||||
| else | |||||
| USE_OPENMP = 1 | |||||
| endif | |||||
| else | |||||
| USE_OPENMP = 1 | |||||
| endif | |||||
| ifeq ($(CORE), POWER8) | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | |||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||||
| endif | |||||
| endif | |||||
| FLAMEPATH = $(HOME)/flame/lib | FLAMEPATH = $(HOME)/flame/lib | ||||
| @@ -16,6 +38,16 @@ else | |||||
| endif | endif | ||||
| endif | endif | ||||
| # Either uncomment the line below or run make with `USE_MASS=1` to enable support for the MASS library. | |||||
| #USE_MASS = 1 | |||||
| ifeq ($(USE_MASS), 1) | |||||
| # Path to the MASS libraries; change it if they are installed in a different location. | |||||
| MASSPATH = /opt/ibm/xlmass/8.1.3/lib | |||||
| COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS | |||||
| EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 | |||||
| endif | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| @@ -17,14 +17,26 @@ ifdef CPUIDEMU | |||||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | EXFLAGS = -DCPUIDEMU -DVENDOR=99 | ||||
| endif | endif | ||||
| ifeq ($(TARGET), P5600) | |||||
| TARGET_FLAGS = -mips32r5 | |||||
| endif | |||||
| ifeq ($(TARGET), I6400) | |||||
| TARGET_FLAGS = -mips64r6 | |||||
| endif | |||||
| ifeq ($(TARGET), P6600) | |||||
| TARGET_FLAGS = -mips64r6 | |||||
| endif | |||||
| all: getarch_2nd | all: getarch_2nd | ||||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | ./getarch_2nd 0 >> $(TARGET_MAKE) | ||||
| ./getarch_2nd 1 >> $(TARGET_CONF) | ./getarch_2nd 1 >> $(TARGET_CONF) | ||||
| config.h : c_check f_check getarch | config.h : c_check f_check getarch | ||||
| perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) | |||||
| perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) | |||||
| ifneq ($(ONLY_CBLAS), 1) | ifneq ($(ONLY_CBLAS), 1) | ||||
| perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) | |||||
| perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) | |||||
| else | else | ||||
| #When we only build CBLAS, we set NOFORTRAN=2 | #When we only build CBLAS, we set NOFORTRAN=2 | ||||
| echo "NOFORTRAN=2" >> $(TARGET_MAKE) | echo "NOFORTRAN=2" >> $(TARGET_MAKE) | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.2.18 | |||||
| VERSION = 0.2.19 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -52,6 +52,7 @@ VERSION = 0.2.18 | |||||
| # USE_THREAD = 0 | # USE_THREAD = 0 | ||||
| # If you're going to use this library with OpenMP, please uncomment the USE_OPENMP line below. | # If you're going to use this library with OpenMP, please uncomment the USE_OPENMP line below. | ||||
| # This flag is always set for POWER8. Don't modify the flag | |||||
| # USE_OPENMP = 1 | # USE_OPENMP = 1 | ||||
| # You can define maximum number of threads. Basically it should be | # You can define maximum number of threads. Basically it should be | ||||
| @@ -153,10 +154,12 @@ NO_AFFINITY = 1 | |||||
| # Common Optimization Flag; | # Common Optimization Flag; | ||||
| # The default -O2 is enough. | # The default -O2 is enough. | ||||
| # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT | |||||
| # COMMON_OPT = -O2 | # COMMON_OPT = -O2 | ||||
| # gfortran option for LAPACK | # gfortran option for LAPACK | ||||
| # enable this flag only on 64bit Linux and if you need a thread safe lapack library | # enable this flag only on 64bit Linux and if you need a thread safe lapack library | ||||
| # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT | |||||
| # FCOMMON_OPT = -frecursive | # FCOMMON_OPT = -frecursive | ||||
| # Profiling flags | # Profiling flags | ||||
| @@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE | |||||
| export GOTOBLAS_MAKEFILE = 1 | export GOTOBLAS_MAKEFILE = 1 | ||||
| # Generating Makefile.conf and config.h | # Generating Makefile.conf and config.h | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) | |||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||||
| ifndef TARGET_CORE | ifndef TARGET_CORE | ||||
| include $(TOPDIR)/Makefile.conf | include $(TOPDIR)/Makefile.conf | ||||
| @@ -462,7 +462,7 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||||
| NO_BINARY_MODE = 1 | NO_BINARY_MODE = 1 | ||||
| endif | endif | ||||
| @@ -502,13 +502,16 @@ endif | |||||
| ifdef NO_BINARY_MODE | ifdef NO_BINARY_MODE | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| CCOMMON_OPT += -mabi=64 | CCOMMON_OPT += -mabi=64 | ||||
| else | else | ||||
| CCOMMON_OPT += -mabi=n32 | CCOMMON_OPT += -mabi=n32 | ||||
| endif | endif | ||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| else ifeq ($(ARCH), $(filter $(ARCH),mips)) | |||||
| CCOMMON_OPT += -mabi=32 | |||||
| BINARY_DEFINED = 1 | |||||
| endif | endif | ||||
| ifeq ($(CORE), LOONGSON3A) | ifeq ($(CORE), LOONGSON3A) | ||||
| @@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64 | |||||
| FCOMMON_OPT += -march=mips64 | FCOMMON_OPT += -march=mips64 | ||||
| endif | endif | ||||
| ifeq ($(CORE), P5600) | |||||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||||
| endif | |||||
| ifeq ($(CORE), I6400) | |||||
| CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) | |||||
| FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) | |||||
| endif | |||||
| ifeq ($(CORE), P6600) | |||||
| CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) | |||||
| FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) | |||||
| endif | |||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| @@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1) | |||||
| EXTRALIB += -lgfortran | EXTRALIB += -lgfortran | ||||
| endif | endif | ||||
| ifdef NO_BINARY_MODE | ifdef NO_BINARY_MODE | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| FCOMMON_OPT += -mabi=64 | FCOMMON_OPT += -mabi=64 | ||||
| else | else | ||||
| FCOMMON_OPT += -mabi=n32 | FCOMMON_OPT += -mabi=n32 | ||||
| endif | endif | ||||
| else ifeq ($(ARCH), $(filter $(ARCH),mips)) | |||||
| FCOMMON_OPT += -mabi=32 | |||||
| endif | endif | ||||
| else | else | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| @@ -677,21 +697,7 @@ FCOMMON_OPT += -i8 | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifneq ($(ARCH), mips64) | |||||
| ifndef BINARY64 | |||||
| FCOMMON_OPT += -m32 | |||||
| else | |||||
| FCOMMON_OPT += -m64 | |||||
| endif | |||||
| else | |||||
| ifdef BINARY64 | |||||
| FCOMMON_OPT += -mabi=64 | |||||
| else | |||||
| FCOMMON_OPT += -mabi=n32 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| FCOMMON_OPT += -mp | FCOMMON_OPT += -mp | ||||
| endif | endif | ||||
| @@ -707,7 +713,7 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| FCOMMON_OPT += -n32 | FCOMMON_OPT += -n32 | ||||
| else | else | ||||
| @@ -737,7 +743,7 @@ endif | |||||
| ifeq ($(C_COMPILER), OPEN64) | ifeq ($(C_COMPILER), OPEN64) | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| CCOMMON_OPT += -n32 | CCOMMON_OPT += -n32 | ||||
| else | else | ||||
| @@ -1126,6 +1132,8 @@ export HAVE_VFP | |||||
| export HAVE_VFPV3 | export HAVE_VFPV3 | ||||
| export HAVE_VFPV4 | export HAVE_VFPV4 | ||||
| export HAVE_NEON | export HAVE_NEON | ||||
| export HAVE_MSA | |||||
| export MSA_FLAGS | |||||
| export KERNELDIR | export KERNELDIR | ||||
| export FUNCTION_PROFILE | export FUNCTION_PROFILE | ||||
| export TARGET_CORE | export TARGET_CORE | ||||
| @@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6 | |||||
| make DEBUG=1 | make DEBUG=1 | ||||
| ### Compile with MASS Support on Power CPU (Optional dependency) | |||||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and | |||||
| Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | |||||
| The library can be installed as follows: | |||||
| * On Ubuntu: | |||||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - | |||||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list | |||||
| sudo apt-get update | |||||
| sudo apt-get install libxlmass-devel.8.1.3 | |||||
| * On RHEL/CentOS: | |||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key | |||||
| sudo rpm --import repomd.xml.key | |||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo | |||||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ | |||||
| sudo yum install libxlmass-devel.8.1.3 | |||||
| After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. | |||||
| Example: | |||||
| Compiling on POWER8 with MASS support: | |||||
| make USE_MASS=1 TARGET=POWER8 | |||||
| ### Install to the directory (optional) | ### Install to the directory (optional) | ||||
| Example: | Example: | ||||
| @@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt | |||||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | ||||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X. | - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X. | ||||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | - **FreeBSD**: Supported by community. We didn't test the library on this OS. | ||||
| - **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||||
| ## Usages | ## Usages | ||||
| Link with libopenblas.a or -lopenblas for shared library. | Link with libopenblas.a or -lopenblas for shared library. | ||||
| @@ -53,26 +53,31 @@ PPC440 | |||||
| PPC440FP2 | PPC440FP2 | ||||
| CELL | CELL | ||||
| 3.MIPS64 CPU: | |||||
| 3.MIPS CPU: | |||||
| P5600 | |||||
| 4.MIPS64 CPU: | |||||
| SICORTEX | SICORTEX | ||||
| LOONGSON3A | LOONGSON3A | ||||
| LOONGSON3B | LOONGSON3B | ||||
| I6400 | |||||
| P6600 | |||||
| 4.IA64 CPU: | |||||
| 5.IA64 CPU: | |||||
| ITANIUM2 | ITANIUM2 | ||||
| 5.SPARC CPU: | |||||
| 6.SPARC CPU: | |||||
| SPARC | SPARC | ||||
| SPARCV7 | SPARCV7 | ||||
| 6.ARM CPU: | |||||
| 7.ARM CPU: | |||||
| CORTEXA15 | CORTEXA15 | ||||
| CORTEXA9 | CORTEXA9 | ||||
| ARMV7 | ARMV7 | ||||
| ARMV6 | ARMV6 | ||||
| ARMV5 | ARMV5 | ||||
| 7.ARM 64-bit CPU: | |||||
| 8.ARM 64-bit CPU: | |||||
| ARMV8 | ARMV8 | ||||
| CORTEXA57 | CORTEXA57 | ||||
| @@ -1,4 +1,4 @@ | |||||
| version: 0.2.18.{build} | |||||
| version: 0.2.19.{build} | |||||
| #environment: | #environment: | ||||
| @@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | ||||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | ||||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ||||
| smallscaling | |||||
| smallscaling \ | |||||
| isamax.goto idamax.goto icamax.goto izamax.goto \ | |||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ | |||||
| sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ | sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ | ||||
| sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ | sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ | ||||
| spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ | spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ | ||||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas | |||||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ | |||||
| isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ | |||||
| snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas | |||||
| mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | ||||
| scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ | scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ | ||||
| @@ -261,7 +265,9 @@ endif | |||||
| essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | ||||
| cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | ||||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl | |||||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ | |||||
| scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ | |||||
| strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl | |||||
| veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | ||||
| scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | ||||
| @@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX) | |||||
| scholesky.veclib : scholesky.$(SUFFIX) | scholesky.veclib : scholesky.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| scholesky.essl : scholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Dcholesky ################################################### | ##################################### Dcholesky ################################################### | ||||
| dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) | dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) | ||||
| @@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX) | |||||
| dcholesky.veclib : dcholesky.$(SUFFIX) | dcholesky.veclib : dcholesky.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| dcholesky.essl : dcholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Ccholesky ################################################### | ##################################### Ccholesky ################################################### | ||||
| ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) | ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) | ||||
| @@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX) | |||||
| ccholesky.veclib : ccholesky.$(SUFFIX) | ccholesky.veclib : ccholesky.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ccholesky.essl : ccholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Zcholesky ################################################### | ##################################### Zcholesky ################################################### | ||||
| @@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX) | |||||
| zcholesky.veclib : zcholesky.$(SUFFIX) | zcholesky.veclib : zcholesky.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| zcholesky.essl : zcholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Sgemm #################################################### | ##################################### Sgemm #################################################### | ||||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
| @@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX) | |||||
| strsm.veclib : strsm.$(SUFFIX) | strsm.veclib : strsm.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| strsm.essl : strsm.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Dtrsm #################################################### | ##################################### Dtrsm #################################################### | ||||
| dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
| @@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) | |||||
| dtrsm.veclib : dtrsm.$(SUFFIX) | dtrsm.veclib : dtrsm.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| dtrsm.essl : dtrsm.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Ctrsm #################################################### | ##################################### Ctrsm #################################################### | ||||
| ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | ||||
| @@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) | |||||
| ctrsm.veclib : ctrsm.$(SUFFIX) | ctrsm.veclib : ctrsm.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ctrsm.essl : ctrsm.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Ztrsm #################################################### | ##################################### Ztrsm #################################################### | ||||
| ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | ||||
| @@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) | |||||
| ztrsm.veclib : ztrsm.$(SUFFIX) | ztrsm.veclib : ztrsm.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ztrsm.essl : ztrsm.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ##################################### Ssyrk #################################################### | ##################################### Ssyrk #################################################### | ||||
| ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
| @@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX) | |||||
| zgemm3m.veclib : zgemm3m.$(SUFFIX) | zgemm3m.veclib : zgemm3m.$(SUFFIX) | ||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ############################################## ISAMAX ############################################## | |||||
| isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| isamax.atlas : isamax.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## IDAMAX ############################################## | |||||
| idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| idamax.atlas : idamax.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## ICAMAX ############################################## | |||||
| icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| icamax.atlas : icamax.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## IZAMAX ############################################## | |||||
| izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| izamax.atlas : izamax.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## SNRM2 ############################################## | |||||
| snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| snrm2.atlas : snrm2.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## DNRM2 ############################################## | |||||
| dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| dnrm2.atlas : dnrm2.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## SCNRM2 ############################################## | |||||
| scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| scnrm2.atlas : scnrm2.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ############################################## DZNRM2 ############################################## | |||||
| dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| dznrm2.atlas : dznrm2.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||||
| ################################################################################################### | ################################################################################################### | ||||
| slinpack.$(SUFFIX) : linpack.c | slinpack.$(SUFFIX) : linpack.c | ||||
| @@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c | |||||
| zgemm3m.$(SUFFIX) : gemm3m.c | zgemm3m.$(SUFFIX) : gemm3m.c | ||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | ||||
| isamax.$(SUFFIX) : iamax.c | |||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| idamax.$(SUFFIX) : iamax.c | |||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||||
| icamax.$(SUFFIX) : iamax.c | |||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| izamax.$(SUFFIX) : iamax.c | |||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||||
| snrm2.$(SUFFIX) : nrm2.c | |||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| dnrm2.$(SUFFIX) : nrm2.c | |||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||||
| scnrm2.$(SUFFIX) : nrm2.c | |||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| dznrm2.$(SUFFIX) : nrm2.c | |||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||||
| smallscaling: smallscaling.c ../$(LIBNAME) | smallscaling: smallscaling.c ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread | |||||
| clean :: | clean :: | ||||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl | |||||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling | |||||
| include $(TOPDIR)/Makefile.tail | include $(TOPDIR)/Makefile.tail | ||||
| @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| #ifdef COMPLEX | #ifdef COMPLEX | ||||
| fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); | |||||
| #else | #else | ||||
| fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MBytes\n", | |||||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MBytes %10.6f sec\n", | |||||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| } | } | ||||
| @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| } | } | ||||
| @@ -0,0 +1,190 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #ifdef __CYGWIN32__ | |||||
| #include <sys/time.h> | |||||
| #endif | |||||
| #include "common.h" | |||||
| #undef IAMAX | |||||
| #ifdef COMPLEX | |||||
| #ifdef DOUBLE | |||||
| #define IAMAX BLASFUNC(izamax) | |||||
| #else | |||||
| #define IAMAX BLASFUNC(icamax) | |||||
| #endif | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| #define IAMAX BLASFUNC(idamax) | |||||
| #else | |||||
| #define IAMAX BLASFUNC(isamax) | |||||
| #endif | |||||
| #endif | |||||
| #if defined(__WIN32__) || defined(__WIN64__) | |||||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||||
| #endif | |||||
| /* | |||||
|  * Windows replacement for gettimeofday(): stores the current system time in *tv | |||||
|  * as seconds and microseconds relative to the Unix epoch. | |||||
|  * The tz argument is accepted for signature compatibility and ignored. | |||||
|  * Nothing is written when tv is NULL. Always returns 0. | |||||
|  */ | |||||
| int gettimeofday(struct timeval *tv, void *tz){ | |||||
|   FILETIME ft; | |||||
|   unsigned __int64 tmpres = 0; | |||||
|   (void)tz; /* unused */ | |||||
|   if (NULL != tv) | |||||
|   { | |||||
|     GetSystemTimeAsFileTime(&ft); | |||||
|     /* Assemble the 64-bit file time from its high and low 32-bit halves. */ | |||||
|     tmpres |= ft.dwHighDateTime; | |||||
|     tmpres <<= 32; | |||||
|     tmpres |= ft.dwLowDateTime; | |||||
|     /*converting file time to unix epoch*/ | |||||
|     tmpres /= 10; /*convert into microseconds*/ | |||||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| #endif | |||||
| /* Optional huge-page allocator. The trailing "&& 0" keeps this whole section compiled out. */ | |||||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||||
| /* | |||||
|  * Allocates `size` bytes from a private System V shared-memory segment requested | |||||
|  * with SHM_HUGETLB and returns the attached address. | |||||
|  * Prints a message and exits with status 1 if shmget() or shmat() fails. | |||||
|  */ | |||||
| static void *huge_malloc(BLASLONG size){ | |||||
| int shmid; | |||||
| void *address; | |||||
| /* Fallback value for headers that do not define the flag. */ | |||||
| #ifndef SHM_HUGETLB | |||||
| #define SHM_HUGETLB 04000 | |||||
| #endif | |||||
| /* The size expression rounds to a HUGE_PAGESIZE boundary (assumes HUGE_PAGESIZE is a power of two - TODO confirm). */ | |||||
| if ((shmid =shmget(IPC_PRIVATE, | |||||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||||
| printf( "Memory allocation failed(shmget).\n"); | |||||
| exit(1); | |||||
| } | |||||
| address = shmat(shmid, NULL, SHM_RND); | |||||
| /* shmat() reports failure as (void *)-1. */ | |||||
| if ((BLASLONG)address == -1){ | |||||
| printf( "Memory allocation failed(shmat).\n"); | |||||
| exit(1); | |||||
| } | |||||
| /* Request removal of the segment id right away; the attached mapping is still returned to the caller. */ | |||||
| shmctl(shmid, IPC_RMID, 0); | |||||
| return address; | |||||
| } | |||||
| /* When this section is enabled, later malloc() calls in this file go to huge_malloc(). */ | |||||
| #define malloc huge_malloc | |||||
| #endif | |||||
| /* | |||||
|  * Benchmark driver for IAMAX: times one call per vector length and reports the mean. | |||||
|  * | |||||
|  * Command line: [from [to [step]]] - vector lengths to sweep (defaults 1, 200, 1); | |||||
|  *               `to` is raised to `from` when given smaller. | |||||
|  * Environment:  OPENBLAS_LOOPS - repetitions averaged per length (default 1, minimum 1); | |||||
|  *               OPENBLAS_INCX  - stride handed to the routine (default 1). | |||||
|  * All output goes to stderr. Returns 0 on success; exits with status 1 when the | |||||
|  * step is below 1 or the work vector cannot be allocated. | |||||
|  */ | |||||
| int main(int argc, char *argv[]){ | |||||
|   FLOAT *x; | |||||
|   blasint m, i, count; | |||||
|   blasint inc_x=1; | |||||
|   int loops = 1; | |||||
|   int l; | |||||
|   char *p; | |||||
|   int from = 1; | |||||
|   int to = 200; | |||||
|   int step = 1; | |||||
|   struct timeval start, stop; | |||||
|   double time1,timeg; | |||||
|   argc--;argv++; | |||||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||||
|   /* A step below 1 never moves m past `to`, so the sweep below would not terminate. */ | |||||
|   if (step < 1) { | |||||
|     fprintf(stderr,"Step must be at least 1!!\n");exit(1); | |||||
|   } | |||||
|   /* The mean below divides by loops; always run at least one repetition. */ | |||||
|   if (loops < 1) loops = 1; | |||||
|   /* blasint may be wider than int (TODO confirm against common.h); cast for %d, matching the (int)m cast below. */ | |||||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||||
|   /* One buffer sized for the largest length is reused for every size in the sweep. */ | |||||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||||
|   } | |||||
|   /* NOTE(review): this seeds random(), but the data below is drawn from rand() - confirm which generator was intended. */ | |||||
| #ifdef linux | |||||
|   srandom(getpid()); | |||||
| #endif | |||||
|   fprintf(stderr, " SIZE Time\n"); | |||||
|   for(m = from; m <= to; m += step) | |||||
|   { | |||||
|     timeg=0; | |||||
|     /* Number of FLOAT elements touched for this length; constant across repetitions. */ | |||||
|     count = m * COMPSIZE * abs(inc_x); | |||||
|     fprintf(stderr, " %6d : ", (int)m); | |||||
|     for (l=0; l<loops; l++) | |||||
|     { | |||||
|       /* Refill with values in [-0.5, 0.5] outside the timed region. */ | |||||
|       for(i = 0; i < count; i++){ | |||||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
|       } | |||||
|       gettimeofday( &start, (struct timezone *)0); | |||||
|       IAMAX (&m, x, &inc_x); | |||||
|       gettimeofday( &stop, (struct timezone *)0); | |||||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||||
|       timeg += time1; | |||||
|     } | |||||
|     timeg /= loops; | |||||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -0,0 +1,190 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #ifdef __CYGWIN32__ | |||||
| #include <sys/time.h> | |||||
| #endif | |||||
| #include "common.h" | |||||
| #undef NRM2 | |||||
| #ifdef COMPLEX | |||||
| #ifdef DOUBLE | |||||
| #define NRM2 BLASFUNC(dznrm2) | |||||
| #else | |||||
| #define NRM2 BLASFUNC(scnrm2) | |||||
| #endif | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| #define NRM2 BLASFUNC(dnrm2) | |||||
| #else | |||||
| #define NRM2 BLASFUNC(snrm2) | |||||
| #endif | |||||
| #endif | |||||
| #if defined(__WIN32__) || defined(__WIN64__) | |||||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||||
| #endif | |||||
| /* | |||||
|  * Windows replacement for gettimeofday(): stores the current system time in *tv | |||||
|  * as seconds and microseconds relative to the Unix epoch. | |||||
|  * The tz argument is accepted for signature compatibility and ignored. | |||||
|  * Nothing is written when tv is NULL. Always returns 0. | |||||
|  */ | |||||
| int gettimeofday(struct timeval *tv, void *tz){ | |||||
|   FILETIME ft; | |||||
|   unsigned __int64 tmpres = 0; | |||||
|   (void)tz; /* unused */ | |||||
|   if (NULL != tv) | |||||
|   { | |||||
|     GetSystemTimeAsFileTime(&ft); | |||||
|     /* Assemble the 64-bit file time from its high and low 32-bit halves. */ | |||||
|     tmpres |= ft.dwHighDateTime; | |||||
|     tmpres <<= 32; | |||||
|     tmpres |= ft.dwLowDateTime; | |||||
|     /*converting file time to unix epoch*/ | |||||
|     tmpres /= 10; /*convert into microseconds*/ | |||||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| #endif | |||||
| /* Optional huge-page allocator. The trailing "&& 0" keeps this whole section compiled out. */ | |||||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||||
| /* | |||||
|  * Allocates `size` bytes from a private System V shared-memory segment requested | |||||
|  * with SHM_HUGETLB and returns the attached address. | |||||
|  * Prints a message and exits with status 1 if shmget() or shmat() fails. | |||||
|  */ | |||||
| static void *huge_malloc(BLASLONG size){ | |||||
| int shmid; | |||||
| void *address; | |||||
| /* Fallback value for headers that do not define the flag. */ | |||||
| #ifndef SHM_HUGETLB | |||||
| #define SHM_HUGETLB 04000 | |||||
| #endif | |||||
| /* The size expression rounds to a HUGE_PAGESIZE boundary (assumes HUGE_PAGESIZE is a power of two - TODO confirm). */ | |||||
| if ((shmid =shmget(IPC_PRIVATE, | |||||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||||
| printf( "Memory allocation failed(shmget).\n"); | |||||
| exit(1); | |||||
| } | |||||
| address = shmat(shmid, NULL, SHM_RND); | |||||
| /* shmat() reports failure as (void *)-1. */ | |||||
| if ((BLASLONG)address == -1){ | |||||
| printf( "Memory allocation failed(shmat).\n"); | |||||
| exit(1); | |||||
| } | |||||
| /* Request removal of the segment id right away; the attached mapping is still returned to the caller. */ | |||||
| shmctl(shmid, IPC_RMID, 0); | |||||
| return address; | |||||
| } | |||||
| /* When this section is enabled, later malloc() calls in this file go to huge_malloc(). */ | |||||
| #define malloc huge_malloc | |||||
| #endif | |||||
| /* | |||||
|  * Benchmark driver for NRM2: times one call per vector length and reports the mean. | |||||
|  * The value returned by the routine is discarded; only elapsed time is measured. | |||||
|  * | |||||
|  * Command line: [from [to [step]]] - vector lengths to sweep (defaults 1, 200, 1); | |||||
|  *               `to` is raised to `from` when given smaller. | |||||
|  * Environment:  OPENBLAS_LOOPS - repetitions averaged per length (default 1, minimum 1); | |||||
|  *               OPENBLAS_INCX  - stride handed to the routine (default 1). | |||||
|  * All output goes to stderr. Returns 0 on success; exits with status 1 when the | |||||
|  * step is below 1 or the work vector cannot be allocated. | |||||
|  */ | |||||
| int main(int argc, char *argv[]){ | |||||
|   FLOAT *x; | |||||
|   blasint m, i, count; | |||||
|   blasint inc_x=1; | |||||
|   int loops = 1; | |||||
|   int l; | |||||
|   char *p; | |||||
|   int from = 1; | |||||
|   int to = 200; | |||||
|   int step = 1; | |||||
|   struct timeval start, stop; | |||||
|   double time1,timeg; | |||||
|   argc--;argv++; | |||||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||||
|   /* A step below 1 never moves m past `to`, so the sweep below would not terminate. */ | |||||
|   if (step < 1) { | |||||
|     fprintf(stderr,"Step must be at least 1!!\n");exit(1); | |||||
|   } | |||||
|   /* The mean below divides by loops; always run at least one repetition. */ | |||||
|   if (loops < 1) loops = 1; | |||||
|   /* blasint may be wider than int (TODO confirm against common.h); cast for %d, matching the (int)m cast below. */ | |||||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||||
|   /* One buffer sized for the largest length is reused for every size in the sweep. */ | |||||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||||
|   } | |||||
|   /* NOTE(review): this seeds random(), but the data below is drawn from rand() - confirm which generator was intended. */ | |||||
| #ifdef linux | |||||
|   srandom(getpid()); | |||||
| #endif | |||||
|   fprintf(stderr, " SIZE Time\n"); | |||||
|   for(m = from; m <= to; m += step) | |||||
|   { | |||||
|     timeg=0; | |||||
|     /* Number of FLOAT elements touched for this length; constant across repetitions. */ | |||||
|     count = m * COMPSIZE * abs(inc_x); | |||||
|     fprintf(stderr, " %6d : ", (int)m); | |||||
|     for (l=0; l<loops; l++) | |||||
|     { | |||||
|       /* Refill with values in [-0.5, 0.5] outside the timed region. */ | |||||
|       for(i = 0; i < count; i++){ | |||||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
|       } | |||||
|       gettimeofday( &start, (struct timezone *)0); | |||||
|       NRM2 (&m, x, &inc_x); | |||||
|       gettimeofday( &stop, (struct timezone *)0); | |||||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||||
|       timeg += time1; | |||||
|     } | |||||
|     timeg /= loops; | |||||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -186,8 +186,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | |||||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -189,9 +189,9 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| #ifdef COMPLEX | #ifdef COMPLEX | ||||
| fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); | |||||
| #else | #else | ||||
| fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6); | |||||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -5,6 +5,7 @@ | |||||
| #include <time.h> | #include <time.h> | ||||
| #include <cblas.h> | #include <cblas.h> | ||||
| #include <omp.h> | #include <omp.h> | ||||
| #include <pthread.h> | |||||
| #define MIN_SIZE 5 | #define MIN_SIZE 5 | ||||
| #define MAX_SIZE 60 | #define MAX_SIZE 60 | ||||
| #define NB_SIZE 10 | #define NB_SIZE 10 | ||||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MBytes\n", | |||||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MBytes %10.6f sec\n", | |||||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ | |||||
| gettimeofday( &start, (struct timezone *)0); | gettimeofday( &start, (struct timezone *)0); | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | |||||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); | |||||
| } | } | ||||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -1,5 +1,8 @@ | |||||
| #!/usr/bin/perl | #!/usr/bin/perl | ||||
| use File::Basename; | |||||
| use File::Temp qw(tempfile); | |||||
| # Checking cross compile | # Checking cross compile | ||||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | ||||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | ||||
| @@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/); | |||||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | $hostarch = "arm64" if ($hostarch eq "aarch64"); | ||||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | ||||
| # Scratch file object; UNLINK => 1 asks File::Temp to delete the file when the object is destroyed. | |||||
| $tmpf = File::Temp->new( UNLINK => 1 ); | |||||
| $binary = $ENV{"BINARY"}; | $binary = $ENV{"BINARY"}; | ||||
| $makefile = shift(@ARGV); | $makefile = shift(@ARGV); | ||||
| @@ -26,14 +30,12 @@ if ($?) { | |||||
| $cross_suffix = ""; | $cross_suffix = ""; | ||||
| if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { | |||||
| if ($1 =~ /(.*-)(.*)/) { | |||||
| $cross_suffix = $1; | |||||
| } | |||||
| } else { | |||||
| if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { | |||||
| $cross_suffix = $1; | |||||
| } | |||||
| if (dirname($compiler_name) ne ".") { | |||||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||||
| } | |||||
| if (basename($compiler_name) =~ /(.*-)(.*)/) { | |||||
| $cross_suffix .= $1; | |||||
| } | } | ||||
| $compiler = ""; | $compiler = ""; | ||||
| @@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/); | |||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | ||||
| $architecture = power if ($data =~ /ARCH_POWER/); | $architecture = power if ($data =~ /ARCH_POWER/); | ||||
| $architecture = mips32 if ($data =~ /ARCH_MIPS32/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | ||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | $architecture = alpha if ($data =~ /ARCH_ALPHA/); | ||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | $architecture = sparc if ($data =~ /ARCH_SPARC/); | ||||
| @@ -79,7 +81,12 @@ if ($os eq "AIX") { | |||||
| $defined = 1; | $defined = 1; | ||||
| } | } | ||||
| if (($architecture eq "mips32") || ($architecture eq "mips64")) { | |||||
| if ($architecture eq "mips") { | |||||
| $compiler_name .= " -mabi=32"; | |||||
| $defined = 1; | |||||
| } | |||||
| if ($architecture eq "mips64") { | |||||
| $compiler_name .= " -mabi=n32" if ($binary eq "32"); | $compiler_name .= " -mabi=n32" if ($binary eq "32"); | ||||
| $compiler_name .= " -mabi=64" if ($binary eq "64"); | $compiler_name .= " -mabi=64" if ($binary eq "64"); | ||||
| $defined = 1; | $defined = 1; | ||||
| @@ -152,10 +159,28 @@ if ($?) { | |||||
| die 1; | die 1; | ||||
| } | } | ||||
| $have_msa = 0; | |||||
| if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||||
| $code = '"addvi.b $w0, $w1, 1"'; | |||||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||||
| print $tmpf "#include <msa.h>\n\n"; | |||||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||||
| $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||||
| my @cmd = ("$compiler_name $args"); | |||||
| system(@cmd) == 0; | |||||
| if ($? != 0) { | |||||
| $have_msa = 0; | |||||
| } else { | |||||
| $have_msa = 1; | |||||
| } | |||||
| unlink("$tmpf.o"); | |||||
| } | |||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | ||||
| $architecture = power if ($data =~ /ARCH_POWER/); | $architecture = power if ($data =~ /ARCH_POWER/); | ||||
| $architecture = mips32 if ($data =~ /ARCH_MIPS32/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | ||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | $architecture = alpha if ($data =~ /ARCH_ALPHA/); | ||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | $architecture = sparc if ($data =~ /ARCH_SPARC/); | ||||
| @@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; | |||||
| print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; | print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; | ||||
| print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; | print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; | ||||
| print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; | print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; | ||||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; | |||||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; | |||||
| print MAKEFILE "CROSS=1\n" if $cross != 0; | print MAKEFILE "CROSS=1\n" if $cross != 0; | ||||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | ||||
| print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; | |||||
| print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; | |||||
| $os =~ tr/[a-z]/[A-Z]/; | $os =~ tr/[a-z]/[A-Z]/; | ||||
| $architecture =~ tr/[a-z]/[A-Z]/; | $architecture =~ tr/[a-z]/[A-Z]/; | ||||
| @@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n"; | |||||
| print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; | print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; | ||||
| print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | ||||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | ||||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||||
| if ($os eq "LINUX") { | if ($os eq "LINUX") { | ||||
| @@ -53,7 +53,7 @@ endif() | |||||
| add_custom_command( | add_custom_command( | ||||
| TARGET ${OpenBLAS_LIBNAME} PRE_LINK | TARGET ${OpenBLAS_LIBNAME} PRE_LINK | ||||
| COMMAND perl | COMMAND perl | ||||
| ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||||
| ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||||
| COMMENT "Create openblas.def file" | COMMENT "Create openblas.def file" | ||||
| VERBATIM) | VERBATIM) | ||||
| @@ -50,20 +50,20 @@ else() | |||||
| set(TARGET_CONF "config.h") | set(TARGET_CONF "config.h") | ||||
| endif () | endif () | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") | |||||
| if (NOT NOFORTRAN) | if (NOT NOFORTRAN) | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") | |||||
| endif () | endif () | ||||
| # compile getarch | # compile getarch | ||||
| set(GETARCH_SRC | set(GETARCH_SRC | ||||
| ${CMAKE_SOURCE_DIR}/getarch.c | |||||
| ${PROJECT_SOURCE_DIR}/getarch.c | |||||
| ${CPUIDEMO} | ${CPUIDEMO} | ||||
| ) | ) | ||||
| if (NOT MSVC) | if (NOT MSVC) | ||||
| list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) | |||||
| list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||||
| endif () | endif () | ||||
| if (MSVC) | if (MSVC) | ||||
| @@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | |||||
| file(MAKE_DIRECTORY ${GETARCH_DIR}) | file(MAKE_DIRECTORY ${GETARCH_DIR}) | ||||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | try_compile(GETARCH_RESULT ${GETARCH_DIR} | ||||
| SOURCES ${GETARCH_SRC} | SOURCES ${GETARCH_SRC} | ||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH_LOG | OUTPUT_VARIABLE GETARCH_LOG | ||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | ||||
| ) | ) | ||||
| @@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | |||||
| set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | ||||
| file(MAKE_DIRECTORY ${GETARCH2_DIR}) | file(MAKE_DIRECTORY ${GETARCH2_DIR}) | ||||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | ||||
| SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} | |||||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH2_LOG | OUTPUT_VARIABLE GETARCH2_LOG | ||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | ||||
| ) | ) | ||||
| @@ -3,7 +3,7 @@ | |||||
| ## Description: Ported from OpenBLAS/Makefile.system | ## Description: Ported from OpenBLAS/Makefile.system | ||||
| ## | ## | ||||
| set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") | |||||
| set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") | |||||
| # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa | # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa | ||||
| # http://stackoverflow.com/questions/714100/os-detecting-makefile | # http://stackoverflow.com/questions/714100/os-detecting-makefile | ||||
| @@ -78,7 +78,7 @@ else () | |||||
| set(ONLY_CBLAS 0) | set(ONLY_CBLAS 0) | ||||
| endif () | endif () | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | |||||
| if (NOT DEFINED NUM_THREADS) | if (NOT DEFINED NUM_THREADS) | ||||
| set(NUM_THREADS ${NUM_CORES}) | set(NUM_THREADS ${NUM_CORES}) | ||||
| @@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy") | |||||
| set(OBJCONV "${CROSS_SUFFIX}objconv") | set(OBJCONV "${CROSS_SUFFIX}objconv") | ||||
| # OS dependent settings | # OS dependent settings | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/os.cmake") | |||||
| # Architecture dependent settings | # Architecture dependent settings | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") | |||||
| # C Compiler dependent settings | # C Compiler dependent settings | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||||
| if (NOT NOFORTRAN) | if (NOT NOFORTRAN) | ||||
| # Fortran Compiler dependent settings | # Fortran Compiler dependent settings | ||||
| include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||||
| endif () | endif () | ||||
| if (BINARY64) | if (BINARY64) | ||||
| @@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX) | |||||
| set(SYMBOLSUFFIX "") | set(SYMBOLSUFFIX "") | ||||
| endif () | endif () | ||||
| set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") | |||||
| set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | |||||
| # TODO: nead to convert these Makefiles | # TODO: nead to convert these Makefiles | ||||
| # include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake | |||||
| # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | |||||
| if (${CORE} STREQUAL "PPC440") | if (${CORE} STREQUAL "PPC440") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") | set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") | ||||
| @@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") | |||||
| set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") | set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") | ||||
| set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") | set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") | ||||
| set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") | |||||
| set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") | |||||
| set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}") | |||||
| set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}") | |||||
| set(LIB_COMPONENTS BLAS) | set(LIB_COMPONENTS BLAS) | ||||
| @@ -332,6 +332,13 @@ typedef int blasint; | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef POWER8 | |||||
| #ifndef YIELDING | |||||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||||
| #endif | |||||
| #endif | |||||
| /* | /* | ||||
| #ifdef PILEDRIVER | #ifdef PILEDRIVER | ||||
| #ifndef YIELDING | #ifndef YIELDING | ||||
| @@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_sparc.h" | #include "common_sparc.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_MIPS | |||||
| #include "common_mips.h" | |||||
| #endif | |||||
| #ifdef ARCH_MIPS64 | #ifdef ARCH_MIPS64 | ||||
| #include "common_mips64.h" | #include "common_mips64.h" | ||||
| #endif | #endif | ||||
| @@ -615,9 +626,14 @@ void gotoblas_profile_init(void); | |||||
| void gotoblas_profile_quit(void); | void gotoblas_profile_quit(void); | ||||
| #ifdef USE_OPENMP | #ifdef USE_OPENMP | ||||
| #ifndef C_MSVC | |||||
| int omp_in_parallel(void); | int omp_in_parallel(void); | ||||
| int omp_get_num_procs(void); | int omp_get_num_procs(void); | ||||
| #else | #else | ||||
| __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||||
| #endif | |||||
| #else | |||||
| #ifdef __ELF__ | #ifdef __ELF__ | ||||
| int omp_in_parallel (void) __attribute__ ((weak)); | int omp_in_parallel (void) __attribute__ ((weak)); | ||||
| int omp_get_num_procs(void) __attribute__ ((weak)); | int omp_get_num_procs(void) __attribute__ ((weak)); | ||||
| @@ -0,0 +1,109 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #ifndef COMMON_MIPS | |||||
| #define COMMON_MIPS | |||||
| #define MB | |||||
| #define WMB | |||||
| #define INLINE inline | |||||
| #define RETURN_BY_COMPLEX | |||||
| #ifndef ASSEMBLER | |||||
| static void INLINE blas_lock(volatile unsigned long *address){ | |||||
| } | |||||
| #define BLAS_LOCK_DEFINED | |||||
| static inline unsigned int rpcc(void){ | |||||
| unsigned long ret; | |||||
| __asm__ __volatile__(".set push \n" | |||||
| "rdhwr %0, $30 \n" | |||||
| ".set pop" : "=r"(ret) : : "memory"); | |||||
| return ret; | |||||
| } | |||||
| #define RPCC_DEFINED | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
| #define GET_IMAGE(res) | |||||
| #define GET_IMAGE_CANCEL | |||||
| #endif | |||||
| #ifndef F_INTERFACE | |||||
| #define REALNAME ASMNAME | |||||
| #else | |||||
| #define REALNAME ASMFNAME | |||||
| #endif | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||||
| #define PROLOGUE \ | |||||
| .arm ;\ | |||||
| .global REALNAME ;\ | |||||
| .func REALNAME ;\ | |||||
| REALNAME: | |||||
| #define EPILOGUE | |||||
| #define PROFCODE | |||||
| #endif | |||||
| #define SEEK_ADDRESS | |||||
| #ifndef PAGESIZE | |||||
| #define PAGESIZE ( 4 << 10) | |||||
| #endif | |||||
| #define HUGE_PAGESIZE ( 4 << 20) | |||||
| #define BUFFER_SIZE (16 << 20) | |||||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||||
| #ifndef MAP_ANONYMOUS | |||||
| #define MAP_ANONYMOUS MAP_ANON | |||||
| #endif | |||||
| #endif | |||||
| @@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ | |||||
| static inline unsigned int rpcc(void){ | static inline unsigned int rpcc(void){ | ||||
| unsigned long ret; | unsigned long ret; | ||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| // unsigned long long tmp; | // unsigned long long tmp; | ||||
| //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | ||||
| //ret=tmp; | //ret=tmp; | ||||
| @@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){ | |||||
| "rdhwr %0, $2\n" | "rdhwr %0, $2\n" | ||||
| ".set pop": "=r"(ret):: "memory"); | ".set pop": "=r"(ret):: "memory"); | ||||
| #else | |||||
| __asm__ __volatile__(".set push \n" | |||||
| ".set mips32r2\n" | |||||
| "rdhwr %0, $30 \n" | |||||
| ".set pop" : "=r"(ret) : : "memory"); | |||||
| #endif | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| #define RPCC_DEFINED | #define RPCC_DEFINED | ||||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||||
| #ifndef NO_AFFINITY | #ifndef NO_AFFINITY | ||||
| #define WHEREAMI | #define WHEREAMI | ||||
| static inline int WhereAmI(void){ | static inline int WhereAmI(void){ | ||||
| @@ -134,7 +127,6 @@ static inline int WhereAmI(void){ | |||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | static inline int blas_quickdivide(blasint x, blasint y){ | ||||
| return x / y; | return x / y; | ||||
| @@ -39,8 +39,13 @@ | |||||
| #ifndef COMMON_POWER | #ifndef COMMON_POWER | ||||
| #define COMMON_POWER | #define COMMON_POWER | ||||
| #if defined(POWER8) | |||||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||||
| #else | |||||
| #define MB __asm__ __volatile__ ("sync") | #define MB __asm__ __volatile__ ("sync") | ||||
| #define WMB __asm__ __volatile__ ("sync") | #define WMB __asm__ __volatile__ ("sync") | ||||
| #endif | |||||
| #define INLINE inline | #define INLINE inline | ||||
| @@ -798,7 +803,7 @@ Lmcount$lazy_ptr: | |||||
| #elif defined(PPC440FP2) | #elif defined(PPC440FP2) | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #elif defined(POWER8) | #elif defined(POWER8) | ||||
| #define BUFFER_SIZE ( 32 << 20) | |||||
| #define BUFFER_SIZE ( 64 << 20) | |||||
| #else | #else | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #endif | #endif | ||||
| @@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| #define CPU_UNKNOWN 0 | #define CPU_UNKNOWN 0 | ||||
| #define CPU_SICORTEX 1 | |||||
| #define CPU_LOONGSON3A 2 | |||||
| #define CPU_LOONGSON3B 3 | |||||
| #define CPU_P5600 1 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "UNKOWN", | "UNKOWN", | ||||
| "SICORTEX", | |||||
| "LOONGSON3A", | |||||
| "LOONGSON3B" | |||||
| "P5600" | |||||
| }; | }; | ||||
| int detect(void){ | int detect(void){ | ||||
| @@ -120,7 +116,7 @@ int detect(void){ | |||||
| if (strstr(p, "loongson3a")) | if (strstr(p, "loongson3a")) | ||||
| return CPU_LOONGSON3A; | return CPU_LOONGSON3A; | ||||
| }else{ | }else{ | ||||
| return CPU_SICORTEX; | |||||
| return CPU_UNKNOWN; | |||||
| } | } | ||||
| } | } | ||||
| //Check model name for Loongson3 | //Check model name for Loongson3 | ||||
| @@ -149,64 +145,40 @@ char *get_corename(void){ | |||||
| } | } | ||||
| void get_architecture(void){ | void get_architecture(void){ | ||||
| printf("MIPS64"); | |||||
| printf("MIPS"); | |||||
| } | } | ||||
| void get_subarchitecture(void){ | void get_subarchitecture(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("LOONGSON3A"); | |||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("LOONGSON3B"); | |||||
| if(detect()==CPU_P5600){ | |||||
| printf("P5600"); | |||||
| }else{ | }else{ | ||||
| printf("SICORTEX"); | |||||
| printf("UNKNOWN"); | |||||
| } | } | ||||
| } | } | ||||
| void get_subdirname(void){ | void get_subdirname(void){ | ||||
| printf("mips64"); | |||||
| printf("mips"); | |||||
| } | } | ||||
| void get_cpuconfig(void){ | void get_cpuconfig(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("#define LOONGSON3A\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("#define LOONGSON3B\n"); | |||||
| if(detect()==CPU_P5600){ | |||||
| printf("#define P5600\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | printf("#define L1_DATA_SIZE 65536\n"); | ||||
| printf("#define L1_DATA_LINESIZE 32\n"); | printf("#define L1_DATA_LINESIZE 32\n"); | ||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_SIZE 1048576\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | printf("#define L2_LINESIZE 32\n"); | ||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| printf("#define DTB_SIZE 4096\n"); | printf("#define DTB_SIZE 4096\n"); | ||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| }else{ | |||||
| printf("#define SICORTEX\n"); | |||||
| printf("#define L1_DATA_SIZE 32768\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | printf("#define L2_ASSOCIATIVE 8\n"); | ||||
| }else{ | |||||
| printf("#define UNKNOWN\n"); | |||||
| } | } | ||||
| } | } | ||||
| void get_libname(void){ | void get_libname(void){ | ||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("loongson3a\n"); | |||||
| }else if(detect()==CPU_LOONGSON3B) { | |||||
| printf("loongson3b\n"); | |||||
| if(detect()==CPU_P5600) { | |||||
| printf("p5600\n"); | |||||
| }else{ | }else{ | ||||
| #ifdef __mips64 | |||||
| printf("mips64\n"); | |||||
| #else | |||||
| printf("mips32\n"); | |||||
| #endif | |||||
| printf("mips\n"); | |||||
| } | } | ||||
| } | } | ||||
| @@ -0,0 +1,238 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define CPU_UNKNOWN 0 | |||||
| #define CPU_SICORTEX 1 | |||||
| #define CPU_LOONGSON3A 2 | |||||
| #define CPU_LOONGSON3B 3 | |||||
| #define CPU_I6400 4 | |||||
| #define CPU_P6600 5 | |||||
| static char *cpuname[] = { | |||||
| "UNKOWN", | |||||
| "SICORTEX", | |||||
| "LOONGSON3A", | |||||
| "LOONGSON3B", | |||||
| "I6400", | |||||
| "P6600" | |||||
| }; | |||||
| int detect(void){ | |||||
| #ifdef linux | |||||
| FILE *infile; | |||||
| char buffer[512], *p; | |||||
| p = (char *)NULL; | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("cpu", buffer, 3)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| #if 0 | |||||
| fprintf(stderr, "%s\n", p); | |||||
| #endif | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A")){ | |||||
| return CPU_LOONGSON3A; | |||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| }else if (strstr(p, "Loongson-3")){ | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("system type", buffer, 11)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if (strstr(p, "loongson3a")) | |||||
| return CPU_LOONGSON3A; | |||||
| }else{ | |||||
| return CPU_SICORTEX; | |||||
| } | |||||
| } | |||||
| //Check model name for Loongson3 | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("model name", buffer, 10)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A")){ | |||||
| return CPU_LOONGSON3A; | |||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| } | |||||
| } | |||||
| #endif | |||||
| return CPU_UNKNOWN; | |||||
| } | |||||
| char *get_corename(void){ | |||||
| return cpuname[detect()]; | |||||
| } | |||||
| void get_architecture(void){ | |||||
| printf("MIPS64"); | |||||
| } | |||||
| void get_subarchitecture(void){ | |||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("LOONGSON3A"); | |||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("LOONGSON3B"); | |||||
| }else if(detect()==CPU_I6400){ | |||||
| printf("I6400"); | |||||
| }else if(detect()==CPU_P6600){ | |||||
| printf("P6600"); | |||||
| }else{ | |||||
| printf("SICORTEX"); | |||||
| } | |||||
| } | |||||
| void get_subdirname(void){ | |||||
| printf("mips64"); | |||||
| } | |||||
| void get_cpuconfig(void){ | |||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("#define LOONGSON3A\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| }else if(detect()==CPU_LOONGSON3B){ | |||||
| printf("#define LOONGSON3B\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||||
| }else if(detect()==CPU_I6400){ | |||||
| printf("#define I6400\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 1048576\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| }else if(detect()==CPU_P6600){ | |||||
| printf("#define P6600\n"); | |||||
| printf("#define L1_DATA_SIZE 65536\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 1048576\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| }else{ | |||||
| printf("#define SICORTEX\n"); | |||||
| printf("#define L1_DATA_SIZE 32768\n"); | |||||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||||
| printf("#define L2_SIZE 512488\n"); | |||||
| printf("#define L2_LINESIZE 32\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||||
| } | |||||
| } | |||||
| void get_libname(void){ | |||||
| if(detect()==CPU_LOONGSON3A) { | |||||
| printf("loongson3a\n"); | |||||
| }else if(detect()==CPU_LOONGSON3B) { | |||||
| printf("loongson3b\n"); | |||||
| }else if(detect()==CPU_I6400) { | |||||
| printf("i6400\n"); | |||||
| }else if(detect()==CPU_P6600) { | |||||
| printf("p6600\n"); | |||||
| }else{ | |||||
| printf("mips64\n"); | |||||
| } | |||||
| } | |||||
| @@ -1172,6 +1172,8 @@ int get_cpuname(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 12: | |||||
| // Braswell | |||||
| case 13: | case 13: | ||||
| // Avoton | // Avoton | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| @@ -1678,6 +1680,8 @@ int get_coretype(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 12: | |||||
| // Braswell | |||||
| case 13: | case 13: | ||||
| // Avoton | // Avoton | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| @@ -110,7 +110,7 @@ ARCH_MIPS64 | |||||
| #endif | #endif | ||||
| #if defined(__mips32) || defined(__mips) | #if defined(__mips32) || defined(__mips) | ||||
| ARCH_MIPS32 | |||||
| ARCH_MIPS | |||||
| #endif | #endif | ||||
| #ifdef __alpha | #ifdef __alpha | ||||
| @@ -1,4 +1,4 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| enable_language(Fortran) | enable_language(Fortran) | ||||
| @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o | |||||
| all :: all1 all2 all3 | all :: all1 all2 all3 | ||||
| all1: xscblat1 xdcblat1 xccblat1 xzcblat1 | all1: xscblat1 xdcblat1 xccblat1 xzcblat1 | ||||
| ifndef CROSS | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| OMP_NUM_THREADS=2 ./xscblat1 | OMP_NUM_THREADS=2 ./xscblat1 | ||||
| OMP_NUM_THREADS=2 ./xdcblat1 | OMP_NUM_THREADS=2 ./xdcblat1 | ||||
| @@ -53,8 +54,10 @@ else | |||||
| OPENBLAS_NUM_THREADS=2 ./xccblat1 | OPENBLAS_NUM_THREADS=2 ./xccblat1 | ||||
| OPENBLAS_NUM_THREADS=2 ./xzcblat1 | OPENBLAS_NUM_THREADS=2 ./xzcblat1 | ||||
| endif | endif | ||||
| endif | |||||
| all2: xscblat2 xdcblat2 xccblat2 xzcblat2 | all2: xscblat2 xdcblat2 xccblat2 xzcblat2 | ||||
| ifndef CROSS | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| OMP_NUM_THREADS=2 ./xscblat2 < sin2 | OMP_NUM_THREADS=2 ./xscblat2 < sin2 | ||||
| OMP_NUM_THREADS=2 ./xdcblat2 < din2 | OMP_NUM_THREADS=2 ./xdcblat2 < din2 | ||||
| @@ -66,8 +69,10 @@ else | |||||
| OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 | OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 | ||||
| OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 | OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 | ||||
| endif | endif | ||||
| endif | |||||
| all3: xscblat3 xdcblat3 xccblat3 xzcblat3 | all3: xscblat3 xdcblat3 xccblat3 xzcblat3 | ||||
| ifndef CROSS | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| OMP_NUM_THREADS=2 ./xscblat3 < sin3 | OMP_NUM_THREADS=2 ./xscblat3 < sin3 | ||||
| OMP_NUM_THREADS=2 ./xdcblat3 < din3 | OMP_NUM_THREADS=2 ./xdcblat3 < din3 | ||||
| @@ -88,6 +93,7 @@ else | |||||
| OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m | OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m | ||||
| OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m | OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m | ||||
| endif | endif | ||||
| endif | |||||
| @@ -1,5 +1,5 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| # sources that need to be compiled twice, once with no flags and once with LOWER | # sources that need to be compiled twice, once with no flags and once with LOWER | ||||
| set(UL_SOURCES | set(UL_SOURCES | ||||
| @@ -1,4 +1,4 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa | # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa | ||||
| @@ -1,4 +1,4 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| if (${CORE} STREQUAL "PPC440") | if (${CORE} STREQUAL "PPC440") | ||||
| set(MEMORY memory_qalloc.c) | set(MEMORY memory_qalloc.c) | ||||
| @@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){ | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| //Intel Avoton | |||||
| if (model == 13) { | |||||
| //Intel Braswell / Avoton | |||||
| if (model == 12 || model == 13) { | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
| return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
| } | } | ||||
| @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||||
| char message[128]; | char message[128]; | ||||
| //char mname[20]; | //char mname[20]; | ||||
| for ( i=1 ; i <= 21; i++) | |||||
| for ( i=1 ; i <= 22; i++) | |||||
| { | { | ||||
| if (!strncasecmp(coretype,corename[i],20)) | if (!strncasecmp(coretype,corename[i],20)) | ||||
| { | { | ||||
| @@ -361,6 +361,9 @@ static void numa_mapping(void) { | |||||
| unsigned long work, bit; | unsigned long work, bit; | ||||
| int count = 0; | int count = 0; | ||||
| int bitmask_idx = 0; | int bitmask_idx = 0; | ||||
| int current_cpu; | |||||
| int current_node = 0; | |||||
| int cpu_count = 0; | |||||
| for (node = 0; node < common -> num_nodes; node ++) { | for (node = 0; node < common -> num_nodes; node ++) { | ||||
| core = 0; | core = 0; | ||||
| @@ -382,33 +385,84 @@ static void numa_mapping(void) { | |||||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | ||||
| #endif | #endif | ||||
| h = 1; | |||||
| while (h < count) h = 2 * h + 1; | |||||
| while (h > 1) { | |||||
| h /= 2; | |||||
| for (i = h; i < count; i++) { | |||||
| work = common -> cpu_info[i]; | |||||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||||
| j = i - h; | |||||
| while (work < common -> cpu_info[j]) { | |||||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||||
| } else { | |||||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||||
| } | |||||
| j -= h; | |||||
| if (j < 0) break; | |||||
| } | |||||
| common -> cpu_info[j + h] = work; | |||||
| if (bit) { | |||||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||||
| } else { | |||||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||||
| current_cpu = sched_getcpu(); | |||||
| for (cpu = 0; cpu < count; cpu++) { | |||||
| if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { | |||||
| current_node = READ_NODE(common -> cpu_info[cpu]); | |||||
| break; | |||||
| } | |||||
| } | |||||
| for (i = 0; i < MAX_BITMASK_LEN; i++) | |||||
| cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); | |||||
| /* | |||||
| * If all the processes can be accommodated in the | |||||
| * in the current node itself, then bind to cores | |||||
| * from the current node only | |||||
| */ | |||||
| if (numprocs <= cpu_count) { | |||||
| /* | |||||
| * First sort all the cores in order from the current node. | |||||
| * Then take remaining nodes one by one in order, | |||||
| * and sort their cores in order. | |||||
| */ | |||||
| for (i = 0; i < count; i++) { | |||||
| for (j = 0; j < count - 1; j++) { | |||||
| int node_1, node_2; | |||||
| int core_1, core_2; | |||||
| int swap = 0; | |||||
| node_1 = READ_NODE(common -> cpu_info[j]); | |||||
| node_2 = READ_NODE(common -> cpu_info[j + 1]); | |||||
| core_1 = READ_CORE(common -> cpu_info[j]); | |||||
| core_2 = READ_CORE(common -> cpu_info[j + 1]); | |||||
| if (node_1 == node_2) { | |||||
| if (core_1 > core_2) | |||||
| swap = 1; | |||||
| } else { | |||||
| if ((node_2 == current_node) || | |||||
| ((node_1 != current_node) && (node_1 > node_2))) | |||||
| swap = 1; | |||||
| } | |||||
| if (swap) { | |||||
| unsigned long temp; | |||||
| temp = common->cpu_info[j]; | |||||
| common->cpu_info[j] = common->cpu_info[j + 1]; | |||||
| common->cpu_info[j + 1] = temp; | |||||
| } | |||||
| } | } | ||||
| } | |||||
| } else { | |||||
| h = 1; | |||||
| while (h < count) h = 2 * h + 1; | |||||
| while (h > 1) { | |||||
| h /= 2; | |||||
| for (i = h; i < count; i++) { | |||||
| work = common -> cpu_info[i]; | |||||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||||
| j = i - h; | |||||
| while (work < common -> cpu_info[j]) { | |||||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||||
| } else { | |||||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||||
| } | |||||
| j -= h; | |||||
| if (j < 0) break; | |||||
| } | |||||
| common -> cpu_info[j + h] = work; | |||||
| if (bit) { | |||||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||||
| } else { | |||||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||||
| } | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -416,7 +470,10 @@ static void numa_mapping(void) { | |||||
| fprintf(stderr, "\nSorting ...\n\n"); | fprintf(stderr, "\nSorting ...\n\n"); | ||||
| for (cpu = 0; cpu < count; cpu++) | for (cpu = 0; cpu < count; cpu++) | ||||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | |||||
| fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], | |||||
| READ_CPU(common -> cpu_info[cpu]), | |||||
| READ_CORE(common -> cpu_info[cpu]), | |||||
| READ_NODE(common -> cpu_info[cpu])); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | ||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | ||||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) | |||||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | ||||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||||
| void blas_set_parameter(void){ | void blas_set_parameter(void){ | ||||
| int factor; | int factor; | ||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||||
| int size = 16; | int size = 16; | ||||
| #else | #else | ||||
| int size = get_L2_size(); | int size = get_L2_size(); | ||||
| @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||||
| endif | endif | ||||
| ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) | ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) | ||||
| #only build without Fortran | #only build without Fortran | ||||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| else | else | ||||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| endif | endif | ||||
| dllinit.$(SUFFIX) : dllinit.c | dllinit.$(SUFFIX) : dllinit.c | ||||
| @@ -114,7 +114,7 @@ if ($compiler eq "") { | |||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| if ($data =~ /IBM/) { | |||||
| if ($data =~ /IBM XL/) { | |||||
| $vendor = IBM; | $vendor = IBM; | ||||
| $openmp = "-openmp"; | $openmp = "-openmp"; | ||||
| } | } | ||||
| @@ -223,7 +223,12 @@ if (!$?) { | |||||
| } | } | ||||
| #For gfortran MIPS | #For gfortran MIPS | ||||
| if ($?) { | if ($?) { | ||||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| $mips_data = `$compiler_bin -E -dM - < /dev/null`; | |||||
| if ($mips_data =~ /_MIPS_ISA_MIPS64/) { | |||||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } else { | |||||
| $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } | |||||
| } | } | ||||
| $binary = "" if ($?); | $binary = "" if ($?); | ||||
| } | } | ||||
| @@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3A */ | /* #define FORCE_LOONGSON3A */ | ||||
| /* #define FORCE_LOONGSON3B */ | /* #define FORCE_LOONGSON3B */ | ||||
| /* #define FORCE_I6400 */ | |||||
| /* #define FORCE_P6600 */ | |||||
| /* #define FORCE_P5600 */ | |||||
| /* #define FORCE_ITANIUM2 */ | /* #define FORCE_ITANIUM2 */ | ||||
| /* #define FORCE_SPARC */ | /* #define FORCE_SPARC */ | ||||
| /* #define FORCE_SPARCV7 */ | /* #define FORCE_SPARCV7 */ | ||||
| @@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_I6400 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "MIPS" | |||||
| #define SUBARCHITECTURE "I6400" | |||||
| #define SUBDIRNAME "mips64" | |||||
| #define ARCHCONFIG "-DI6400 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
| #define LIBNAME "i6400" | |||||
| #define CORENAME "I6400" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_P6600 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "MIPS" | |||||
| #define SUBARCHITECTURE "P6600" | |||||
| #define SUBDIRNAME "mips64" | |||||
| #define ARCHCONFIG "-DP6600 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
| #define LIBNAME "p6600" | |||||
| #define CORENAME "P6600" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_P5600 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "MIPS" | |||||
| #define SUBARCHITECTURE "P5600" | |||||
| #define SUBDIRNAME "mips" | |||||
| #define ARCHCONFIG "-DP5600 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
| #define LIBNAME "p5600" | |||||
| #define CORENAME "P5600" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_ITANIUM2 | #ifdef FORCE_ITANIUM2 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "IA64" | #define ARCHITECTURE "IA64" | ||||
| @@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef __mips__ | #ifdef __mips__ | ||||
| #ifdef __mips64 | |||||
| #include "cpuid_mips64.c" | |||||
| #else | |||||
| #include "cpuid_mips.c" | #include "cpuid_mips.c" | ||||
| #endif | |||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| #endif | #endif | ||||
| @@ -1,5 +1,5 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| set(BLAS1_SOURCES | set(BLAS1_SOURCES | ||||
| @@ -42,6 +42,10 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| // Disable multi-threading as it does not show any performance | |||||
| // benefits. Keep the multi-threading code for the record. | |||||
| #undef SMP | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | ||||
| @@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| { | { | ||||
| buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); | buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); | ||||
| // It seems to be required for some K8 or Barcelona CPU | |||||
| buffer_size += 8; | |||||
| if(incx != 1) | if(incx != 1) | ||||
| buffer_size += n * 2; | buffer_size += n * 2; | ||||
| } | } | ||||
| @@ -1,6 +1,6 @@ | |||||
| include_directories(${CMAKE_SOURCE_DIR}) | |||||
| include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") | |||||
| include_directories(${PROJECT_SOURCE_DIR}) | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") | |||||
| # Makefile | # Makefile | ||||
| @@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64) | |||||
| USE_GEMM3M = 1 | USE_GEMM3M = 1 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), MIPS) | |||||
| USE_GEMM3M = 1 | |||||
| endif | |||||
| ifeq ($(ARCH), arm) | ifeq ($(ARCH), arm) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| @@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| { | { | ||||
| BLASLONG i=0,j=0; | BLASLONG i=0,j=0; | ||||
| if ( (n <= 0) || (inc_x <= 0)) | |||||
| return(0); | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| @@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| BLASLONG ip = 0; | BLASLONG ip = 0; | ||||
| FLOAT temp; | FLOAT temp; | ||||
| if ( (n <= 0) || (inc_x <= 0)) | |||||
| return(0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| for ( i=0; i<n; i++ ) | for ( i=0; i<n; i++ ) | ||||
| { | { | ||||
| @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| str TMPF, [Y], #SZ | str TMPF, [Y], #SZ | ||||
| #else | #else | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v0.2s}, [X], #8 | |||||
| st1 {v0.2s}, [Y], #8 | |||||
| ldr d0, [X], #8 | |||||
| str d0, [Y], #8 | |||||
| #else | #else | ||||
| ld1 {v0.2d}, [X], #16 | |||||
| st1 {v0.2d}, [Y], #16 | |||||
| ldr q0, [X], #16 | |||||
| str q0, [Y], #16 | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| .macro KERNEL_F4 | .macro KERNEL_F4 | ||||
| #if !defined(COMPLEX) | #if !defined(COMPLEX) | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v0.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| ldr q0, [X], #16 | |||||
| str q0, [Y], #16 | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld1 {v0.4s}, [X], #16 | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| ldr q0, [X], #16 | |||||
| str q0, [Y], #16 | |||||
| ldr q1, [X], #16 | |||||
| str q1, [Y], #16 | |||||
| #endif | #endif | ||||
| #else // COMPLEX | #else // COMPLEX | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v0.4s}, [X], #16 | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| ldr q0, [X], #16 | |||||
| str q0, [Y], #16 | |||||
| ldr q1, [X], #16 | |||||
| str q1, [Y], #16 | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld1 {v0.4s}, [X], #16 | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| ld1 {v2.4s}, [X], #16 | |||||
| ld1 {v3.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| st1 {v2.4s}, [Y], #16 | |||||
| st1 {v3.4s}, [Y], #16 | |||||
| ldr q0, [X], #16 | |||||
| str q0, [Y], #16 | |||||
| ldr q1, [X], #16 | |||||
| str q1, [Y], #16 | |||||
| ldr q2, [X], #16 | |||||
| str q2, [Y], #16 | |||||
| ldr q3, [X], #16 | |||||
| str q3, [Y], #16 | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q0, q1, [pCRow0] | stp q0, q1, [pCRow0] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| ldp q2, q3, [pCRow0] | ldp q2, q3, [pCRow0] | ||||
| fmla v2.2d, v18.2d, alphaV0 | fmla v2.2d, v18.2d, alphaV0 | ||||
| @@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q4, q5, [pCRow1] | stp q4, q5, [pCRow1] | ||||
| add pCRow1, pCRow1, #32 | add pCRow1, pCRow1, #32 | ||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| ldp q6, q7, [pCRow1] | ldp q6, q7, [pCRow1] | ||||
| fmla v6.2d, v22.2d, alphaV0 | fmla v6.2d, v22.2d, alphaV0 | ||||
| @@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q0, q1, [pCRow2] | stp q0, q1, [pCRow2] | ||||
| add pCRow2, pCRow2, #32 | add pCRow2, pCRow2, #32 | ||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| ldp q2, q3, [pCRow2] | ldp q2, q3, [pCRow2] | ||||
| fmla v2.2d, v26.2d, alphaV0 | fmla v2.2d, v26.2d, alphaV0 | ||||
| @@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q4, q5, [pCRow3] | stp q4, q5, [pCRow3] | ||||
| add pCRow3, pCRow3, #32 | add pCRow3, pCRow3, #32 | ||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| ldp q6, q7, [pCRow3] | ldp q6, q7, [pCRow3] | ||||
| fmla v6.2d, v30.2d, alphaV0 | fmla v6.2d, v30.2d, alphaV0 | ||||
| @@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | ld1 {v12.2d, v13.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| fmla v13.2d, v21.2d, alphaV0 | fmla v13.2d, v21.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pCRow2] | ld1 {v8.2d, v9.2d}, [pCRow2] | ||||
| fmla v8.2d, v24.2d, alphaV0 | fmla v8.2d, v24.2d, alphaV0 | ||||
| fmla v9.2d, v25.2d, alphaV0 | fmla v9.2d, v25.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow2] | st1 {v8.2d, v9.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||||
| ld1 {v12.2d, v13.2d}, [pCRow3] | |||||
| fmla v12.2d, v28.2d, alphaV0 | fmla v12.2d, v28.2d, alphaV0 | ||||
| fmla v13.2d, v29.2d, alphaV0 | fmla v13.2d, v29.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||||
| st1 {v12.2d, v13.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | ld1 {v12.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #16 | |||||
| ld1 {v8.2d}, [pCRow2] | ld1 {v8.2d}, [pCRow2] | ||||
| fmla v8.2d, v24.2d, alphaV0 | fmla v8.2d, v24.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow2] | st1 {v8.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | |||||
| ld1 {v12.2d}, [pCRow3] | |||||
| fmla v12.2d, v28.2d, alphaV0 | fmla v12.2d, v28.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | |||||
| st1 {v12.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #16 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #16 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1, pCRow0, LDC | |||||
| ld1 {v8.d}[0], [pCRow0] | ld1 {v8.d}[0], [pCRow0] | ||||
| ld1 {v8.d}[1], [pCRow1] | ld1 {v8.d}[1], [pCRow1] | ||||
| @@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| st1 {v8.d}[0], [pCRow0] | st1 {v8.d}[0], [pCRow0] | ||||
| st1 {v8.d}[1], [pCRow1] | st1 {v8.d}[1], [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #8 | |||||
| ld1 {v12.d}[0], [pCRow2] | ld1 {v12.d}[0], [pCRow2] | ||||
| ld1 {v12.d}[1], [pCRow1] | |||||
| ld1 {v12.d}[1], [pCRow3] | |||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.d}[0], [pCRow2] | st1 {v12.d}[0], [pCRow2] | ||||
| st1 {v12.d}[1], [pCRow1] | |||||
| st1 {v12.d}[1], [pCRow3] | |||||
| add pCRow0, pCRow0, #8 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #8 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #8 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | fmla v19.2d, v3.2d, v8.d[0] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | fmla v20.2d, v0.2d, v8.d[1] | ||||
| fmla v21.2d, v1.2d, v8.d[1] | fmla v21.2d, v1.2d, v8.d[1] | ||||
| fmla v22.2d, v2.2d, v8.d[1] | fmla v22.2d, v2.2d, v8.d[1] | ||||
| @@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE8x2 | .macro SAVE8x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1, pCRow0, LDC | |||||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmla v0.2d, v16.2d, alphaV0 | fmla v0.2d, v16.2d, alphaV0 | ||||
| @@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v3.2d, v19.2d, alphaV0 | fmla v3.2d, v19.2d, alphaV0 | ||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #64 | |||||
| ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0 | fmla v4.2d, v20.2d, alphaV0 | ||||
| fmla v5.2d, v21.2d, alphaV0 | fmla v5.2d, v21.2d, alphaV0 | ||||
| @@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v7.2d, v23.2d, alphaV0 | fmla v7.2d, v23.2d, alphaV0 | ||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #64 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #64 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | ld1 {v12.2d, v13.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| fmla v13.2d, v21.2d, alphaV0 | fmla v13.2d, v21.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1 , pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | ld1 {v12.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #16 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1 , pCRow0, LDC | |||||
| ld1 {v8.d}[0], [pCRow0] | ld1 {v8.d}[0], [pCRow0] | ||||
| ld1 {v8.d}[1], [pCRow1] | ld1 {v8.d}[1], [pCRow1] | ||||
| @@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| st1 {v8.d}[0], [pCRow0] | st1 {v8.d}[0], [pCRow0] | ||||
| st1 {v8.d}[1], [pCRow1] | st1 {v8.d}[1], [pCRow1] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | add pCRow0, pCRow0, #8 | ||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #8 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | fmla v19.2d, v3.2d, v8.d[0] | ||||
| .endm | .endm | ||||
| .macro SAVE8x1 | .macro SAVE8x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmla v0.2d, v16.2d, alphaV0 | fmla v0.2d, v16.2d, alphaV0 | ||||
| fmla v1.2d, v17.2d, alphaV0 | fmla v1.2d, v17.2d, alphaV0 | ||||
| @@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v3.2d, v19.2d, alphaV0 | fmla v3.2d, v19.2d, alphaV0 | ||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| .endm | .endm | ||||
| @@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| .endm | .endm | ||||
| @@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| .endm | .endm | ||||
| @@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmadd d8, d16, alpha0, d8 | fmadd d8, d16, alpha0, d8 | ||||
| str d8, [pCRow0] | str d8, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | add pCRow0, pCRow0, #8 | ||||
| .endm | .endm | ||||
| @@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| .align 5 | |||||
| dgemm_kernel_L4_BEGIN: | dgemm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC | mov pCRow0, pC | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| @@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M4_40 | ble dgemm_kernel_L4_M4_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M4_22: | dgemm_kernel_L4_M4_22: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M4_22 | bgt dgemm_kernel_L4_M4_22 | ||||
| @@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40: | |||||
| dgemm_kernel_L4_M4_42: | dgemm_kernel_L4_M4_42: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M4_42 | bgt dgemm_kernel_L4_M4_42 | ||||
| @@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100: | |||||
| dgemm_kernel_L4_M4_END: | dgemm_kernel_L4_M4_END: | ||||
| dgemm_kernel_L4_M2_BEGIN: | dgemm_kernel_L4_M2_BEGIN: | ||||
| mov counterI, origM | mov counterI, origM | ||||
| @@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M2_40 | ble dgemm_kernel_L4_M2_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M2_22: | dgemm_kernel_L4_M2_22: | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L4_M2_100 | ble dgemm_kernel_L4_M2_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| dgemm_kernel_L4_M2_42: | dgemm_kernel_L4_M2_42: | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M2_42 | bgt dgemm_kernel_L4_M2_42 | ||||
| @@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M1_40 | ble dgemm_kernel_L4_M1_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M1_22: | dgemm_kernel_L4_M1_22: | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L4_M1_100 | ble dgemm_kernel_L4_M1_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| dgemm_kernel_L4_M1_42: | dgemm_kernel_L4_M1_42: | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M1_42 | bgt dgemm_kernel_L4_M1_42 | ||||
| @@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||||
| tst counterJ , #2 | tst counterJ , #2 | ||||
| ble dgemm_kernel_L1_BEGIN | ble dgemm_kernel_L1_BEGIN | ||||
| mov pCRow0, pC // pCRow0 = pC | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pC,pC,LDC, lsl #1 | |||||
| add pC, pCRow1, LDC | |||||
| mov pA, origPA // pA = A | mov pA, origPA // pA = A | ||||
| @@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dgemm_kernel_L2_M4_BEGIN | ble dgemm_kernel_L2_M4_BEGIN | ||||
| .align 5 | |||||
| dgemm_kernel_L2_M8_20: | dgemm_kernel_L2_M8_20: | ||||
| INIT8x2 | INIT8x2 | ||||
| @@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL,#0 | cmp counterL,#0 | ||||
| ble dgemm_kernel_L2_M8_40 | ble dgemm_kernel_L2_M8_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L2_M8_22: | dgemm_kernel_L2_M8_22: | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M8_22 | bgt dgemm_kernel_L2_M8_22 | ||||
| dgemm_kernel_L2_M8_40: | dgemm_kernel_L2_M8_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L2_M8_100 | ble dgemm_kernel_L2_M8_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M8_42: | dgemm_kernel_L2_M8_42: | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| @@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL,#0 | cmp counterL,#0 | ||||
| ble dgemm_kernel_L2_M4_40 | ble dgemm_kernel_L2_M4_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L2_M4_22: | dgemm_kernel_L2_M4_22: | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L2_M4_100 | ble dgemm_kernel_L2_M4_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M4_42: | dgemm_kernel_L2_M4_42: | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M4_42 | bgt dgemm_kernel_L2_M4_42 | ||||
| @@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20: | |||||
| dgemm_kernel_L2_M2_22: | dgemm_kernel_L2_M2_22: | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M2_22 | bgt dgemm_kernel_L2_M2_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M2_40: | dgemm_kernel_L2_M2_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20: | |||||
| dgemm_kernel_L2_M1_22: | dgemm_kernel_L2_M1_22: | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M1_22 | bgt dgemm_kernel_L2_M1_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M1_40: | dgemm_kernel_L2_M1_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dgemm_kernel_L1_M4_BEGIN | ble dgemm_kernel_L1_M4_BEGIN | ||||
| .align 5 | |||||
| dgemm_kernel_L1_M8_20: | dgemm_kernel_L1_M8_20: | ||||
| INIT8x1 | INIT8x1 | ||||
| @@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M8_40 | ble dgemm_kernel_L1_M8_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L1_M8_22: | dgemm_kernel_L1_M8_22: | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| @@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M8_100 | ble dgemm_kernel_L1_M8_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M8_42: | dgemm_kernel_L1_M8_42: | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| @@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M4_40 | ble dgemm_kernel_L1_M4_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L1_M4_22: | dgemm_kernel_L1_M4_22: | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M4_100 | ble dgemm_kernel_L1_M4_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M4_42: | dgemm_kernel_L1_M4_42: | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L1_M4_42 | bgt dgemm_kernel_L1_M4_42 | ||||
| @@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22: | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L1_M2_22 | bgt dgemm_kernel_L1_M2_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M2_40: | dgemm_kernel_L1_M2_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M1_40 | ble dgemm_kernel_L1_M1_40 | ||||
| dgemm_kernel_L1_M1_22: | dgemm_kernel_L1_M1_22: | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| @@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M1_100 | ble dgemm_kernel_L1_M1_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M1_42: | dgemm_kernel_L1_M1_42: | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define temp x16 | |||||
| #define tempOffset x17 | |||||
| #define tempK x18 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alpha x17 | |||||
| #define temp x18 | |||||
| #define tempOffset x19 | |||||
| #define tempK x20 | |||||
| #define alpha0 d10 | #define alpha0 d10 | ||||
| #define alphaV0 v10.d[0] | #define alphaV0 v10.d[0] | ||||
| #define alpha1 d11 | |||||
| #define alphaV1 v11.d[0] | |||||
| #define alpha2 d14 | |||||
| #define alphaV2 v14.d[0] | |||||
| #define alpha3 d15 | |||||
| #define alphaV3 v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| // 00 origM | // 00 origM | ||||
| // 01 origN | // 01 origN | ||||
| @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //v05 pA1_2, pA1_3 | //v05 pA1_2, pA1_3 | ||||
| //v06 pA1_4, pA1_5 | //v06 pA1_4, pA1_5 | ||||
| //v07 pA1_6, pA1_7 | //v07 pA1_6, pA1_7 | ||||
| //v08 must save pB0_0, pB0_1 | |||||
| //v09 must save pB0_2, pB0_3 | |||||
| //v10 must save ALPHA0 | |||||
| //v11 must save ALPHA1 | |||||
| //v12 must save pB1_0, pB1_1 | |||||
| //v13 must save pB1_2, pB1_3 | |||||
| //v14 must save ALPHA2 | |||||
| //v15 must save ALPHA3 | |||||
| //v08 must save pB0_0 | |||||
| //v09 must save pB0_1 | |||||
| //v10 must save pB0_2 --> ALPHA0 | |||||
| //v11 must save pB0_3 | |||||
| //v12 must save pB1_0 | |||||
| //v13 must save pB1_1 | |||||
| //v14 must save pB1_2 | |||||
| //v15 must save pB1_3 | |||||
| //v16 must save C00, C01 | //v16 must save C00, C01 | ||||
| //v17 must save C02, C03 | //v17 must save C02, C03 | ||||
| //v18 C04, C05 | //v18 C04, C05 | ||||
| @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_I | .macro KERNEL8x4_I | ||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q0, q1, [pA], #32 | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| fmul v20.2d, v0.2d, v9.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmul v17.2d, v1.2d, v8.d[0] | fmul v17.2d, v1.2d, v8.d[0] | ||||
| fmul v21.2d, v1.2d, v9.d[0] | |||||
| ldp q2, q3, [pA], #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | |||||
| fmul v28.2d, v0.2d, v11.d[0] | |||||
| ldp q4, q5, [pA], #32 | |||||
| fmul v25.2d, v1.2d, v10.d[0] | |||||
| fmul v29.2d, v1.2d, v11.d[0] | |||||
| ldp d12, d13, [pB], #16 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | fmul v18.2d, v2.2d, v8.d[0] | ||||
| fmul v19.2d, v3.2d, v8.d[0] | |||||
| fmul v22.2d, v2.2d, v9.d[0] | |||||
| fmul v20.2d, v0.2d, v8.d[1] | |||||
| fmul v21.2d, v1.2d, v8.d[1] | |||||
| fmul v22.2d, v2.2d, v8.d[1] | |||||
| fmul v23.2d, v3.2d, v8.d[1] | |||||
| ldp d14, d15, [pB], #16 | |||||
| fmul v24.2d, v0.2d, v9.d[0] | |||||
| fmul v25.2d, v1.2d, v9.d[0] | |||||
| fmul v26.2d, v2.2d, v9.d[0] | |||||
| fmul v27.2d, v3.2d, v9.d[0] | |||||
| fmul v26.2d, v2.2d, v10.d[0] | |||||
| fmul v30.2d, v2.2d, v11.d[0] | |||||
| fmul v28.2d, v0.2d, v9.d[1] | |||||
| fmul v29.2d, v1.2d, v9.d[1] | |||||
| fmul v30.2d, v2.2d, v9.d[1] | |||||
| fmul v31.2d, v3.2d, v9.d[1] | |||||
| ldp q6, q7, [pA], #32 | |||||
| ld1 {v4.2d, v5.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v6.2d, v7.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v19.2d, v3.2d, v8.d[0] | |||||
| fmul v27.2d, v3.2d, v10.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v31.2d, v3.2d, v11.d[0] | |||||
| fmul v23.2d, v3.2d, v9.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_M1 | .macro KERNEL8x4_M1 | ||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v20.2d, v0.2d, v9.d[0] | |||||
| ldp q4, q5, [pA], #32 | |||||
| fmla v24.2d, v0.2d, v10.d[0] | |||||
| fmla v28.2d, v0.2d, v11.d[0] | |||||
| ldp d12, d13, [pB], #16 | |||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| fmla v18.2d, v2.2d, v8.d[0] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v25.2d, v1.2d, v10.d[0] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | |||||
| fmla v21.2d, v1.2d, v8.d[1] | |||||
| fmla v22.2d, v2.2d, v8.d[1] | |||||
| fmla v23.2d, v3.2d, v8.d[1] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| fmla v24.2d, v0.2d, v9.d[0] | |||||
| fmla v25.2d, v1.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v9.d[0] | |||||
| fmla v27.2d, v3.2d, v9.d[0] | |||||
| fmla v21.2d, v1.2d, v9.d[0] | |||||
| fmla v29.2d, v1.2d, v11.d[0] | |||||
| fmla v28.2d, v0.2d, v9.d[1] | |||||
| fmla v29.2d, v1.2d, v9.d[1] | |||||
| fmla v30.2d, v2.2d, v9.d[1] | |||||
| fmla v31.2d, v3.2d, v9.d[1] | |||||
| ldp d14, d15, [pB], #16 | |||||
| ld1 {v4.2d, v5.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v6.2d, v7.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmla v18.2d, v2.2d, v8.d[0] | |||||
| fmla v22.2d, v2.2d, v9.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v26.2d, v2.2d, v10.d[0] | |||||
| fmla v30.2d, v2.2d, v11.d[0] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v23.2d, v3.2d, v9.d[0] | |||||
| ldp q6, q7, [pA], #32 | |||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| fmla v27.2d, v3.2d, v10.d[0] | |||||
| fmla v31.2d, v3.2d, v11.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_M2 | .macro KERNEL8x4_M2 | ||||
| fmla v16.2d, v4.2d, v12.d[0] | fmla v16.2d, v4.2d, v12.d[0] | ||||
| fmla v20.2d, v4.2d, v13.d[0] | |||||
| fmla v24.2d, v4.2d, v14.d[0] | |||||
| fmla v28.2d, v4.2d, v15.d[0] | |||||
| ldp q0, q1, [pA], #32 | |||||
| fmla v17.2d, v5.2d, v12.d[0] | fmla v17.2d, v5.2d, v12.d[0] | ||||
| fmla v25.2d, v5.2d, v14.d[0] | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmla v21.2d, v5.2d, v13.d[0] | |||||
| fmla v29.2d, v5.2d, v15.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmla v18.2d, v6.2d, v12.d[0] | fmla v18.2d, v6.2d, v12.d[0] | ||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v22.2d, v6.2d, v13.d[0] | |||||
| fmla v20.2d, v4.2d, v12.d[1] | |||||
| fmla v21.2d, v5.2d, v12.d[1] | |||||
| fmla v22.2d, v6.2d, v12.d[1] | |||||
| fmla v23.2d, v7.2d, v12.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v24.2d, v4.2d, v13.d[0] | |||||
| fmla v25.2d, v5.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v14.d[0] | |||||
| fmla v30.2d, v6.2d, v15.d[0] | |||||
| fmla v28.2d, v4.2d, v13.d[1] | |||||
| fmla v29.2d, v5.2d, v13.d[1] | |||||
| fmla v30.2d, v6.2d, v13.d[1] | |||||
| fmla v31.2d, v7.2d, v13.d[1] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v23.2d, v7.2d, v13.d[0] | |||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q2, q3, [pA], #32 | |||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| fmla v27.2d, v7.2d, v14.d[0] | |||||
| fmla v31.2d, v7.2d, v15.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_E | .macro KERNEL8x4_E | ||||
| fmla v16.2d, v4.2d, v12.d[0] | fmla v16.2d, v4.2d, v12.d[0] | ||||
| fmla v20.2d, v4.2d, v13.d[0] | |||||
| fmla v24.2d, v4.2d, v14.d[0] | |||||
| fmla v28.2d, v4.2d, v15.d[0] | |||||
| fmla v17.2d, v5.2d, v12.d[0] | fmla v17.2d, v5.2d, v12.d[0] | ||||
| fmla v18.2d, v6.2d, v12.d[0] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v25.2d, v5.2d, v14.d[0] | |||||
| fmla v21.2d, v5.2d, v13.d[0] | |||||
| fmla v29.2d, v5.2d, v15.d[0] | |||||
| fmla v20.2d, v4.2d, v12.d[1] | |||||
| fmla v21.2d, v5.2d, v12.d[1] | |||||
| fmla v22.2d, v6.2d, v12.d[1] | |||||
| fmla v23.2d, v7.2d, v12.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v24.2d, v4.2d, v13.d[0] | |||||
| fmla v25.2d, v5.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v13.d[0] | |||||
| fmla v18.2d, v6.2d, v12.d[0] | |||||
| fmla v22.2d, v6.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v14.d[0] | |||||
| fmla v30.2d, v6.2d, v15.d[0] | |||||
| fmla v28.2d, v4.2d, v13.d[1] | |||||
| fmla v29.2d, v5.2d, v13.d[1] | |||||
| fmla v30.2d, v6.2d, v13.d[1] | |||||
| fmla v31.2d, v7.2d, v13.d[1] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v23.2d, v7.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v14.d[0] | |||||
| fmla v31.2d, v7.2d, v15.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_SUB | .macro KERNEL8x4_SUB | ||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q0, q1, [pA], #32 | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v20.2d, v0.2d, v9.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| fmla v21.2d, v1.2d, v9.d[0] | |||||
| ldp q2, q3, [pA], #32 | |||||
| fmla v24.2d, v0.2d, v10.d[0] | |||||
| fmla v28.2d, v0.2d, v11.d[0] | |||||
| fmla v25.2d, v1.2d, v10.d[0] | |||||
| fmla v29.2d, v1.2d, v11.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v22.2d, v2.2d, v9.d[0] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | |||||
| fmla v21.2d, v1.2d, v8.d[1] | |||||
| fmla v22.2d, v2.2d, v8.d[1] | |||||
| fmla v23.2d, v3.2d, v8.d[1] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| fmla v24.2d, v0.2d, v9.d[0] | |||||
| fmla v25.2d, v1.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v9.d[0] | |||||
| fmla v27.2d, v3.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v10.d[0] | |||||
| fmla v30.2d, v2.2d, v11.d[0] | |||||
| fmla v28.2d, v0.2d, v9.d[1] | |||||
| fmla v29.2d, v1.2d, v9.d[1] | |||||
| fmla v30.2d, v2.2d, v9.d[1] | |||||
| fmla v31.2d, v3.2d, v9.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v27.2d, v3.2d, v10.d[0] | |||||
| fmla v31.2d, v3.2d, v11.d[0] | |||||
| fmla v23.2d, v3.2d, v9.d[0] | |||||
| .endm | .endm | ||||
| .macro SAVE8x4 | .macro SAVE8x4 | ||||
| add pCRow1, pCRow0, LDC | |||||
| fmov alpha0, alpha | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| stp q0, q1, [pCRow0] | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| stp q2, q3, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| fmul v4.2d, v20.2d, alphaV0 | fmul v4.2d, v20.2d, alphaV0 | ||||
| fmul v5.2d, v21.2d, alphaV1 | |||||
| fmul v6.2d, v22.2d, alphaV2 | |||||
| fmul v7.2d, v23.2d, alphaV3 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||||
| fmul v5.2d, v21.2d, alphaV0 | |||||
| stp q4, q5, [pCRow1] | |||||
| add pCRow1, pCRow2, LDC | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| fmul v6.2d, v22.2d, alphaV0 | |||||
| fmul v7.2d, v23.2d, alphaV0 | |||||
| stp q6, q7, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| fmul v0.2d, v24.2d, alphaV0 | fmul v0.2d, v24.2d, alphaV0 | ||||
| fmul v1.2d, v25.2d, alphaV1 | |||||
| fmul v2.2d, v26.2d, alphaV2 | |||||
| fmul v3.2d, v27.2d, alphaV3 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] | |||||
| fmul v1.2d, v25.2d, alphaV0 | |||||
| stp q0, q1, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| fmul v2.2d, v26.2d, alphaV0 | |||||
| fmul v3.2d, v27.2d, alphaV0 | |||||
| stp q2, q3, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v4.2d, v28.2d, alphaV0 | fmul v4.2d, v28.2d, alphaV0 | ||||
| fmul v5.2d, v29.2d, alphaV1 | |||||
| fmul v6.2d, v30.2d, alphaV2 | |||||
| fmul v7.2d, v31.2d, alphaV3 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||||
| fmul v5.2d, v29.2d, alphaV0 | |||||
| stp q4, q5, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v6.2d, v30.2d, alphaV0 | |||||
| fmul v7.2d, v31.2d, alphaV0 | |||||
| stp q6, q7, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV2 | |||||
| fmul v13.2d, v21.2d, alphaV3 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| fmul v13.2d, v21.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| fmul v8.2d, v24.2d, alphaV0 | fmul v8.2d, v24.2d, alphaV0 | ||||
| fmul v9.2d, v25.2d, alphaV1 | |||||
| fmul v9.2d, v25.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow2] | st1 {v8.2d, v9.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v28.2d, alphaV2 | |||||
| fmul v13.2d, v29.2d, alphaV3 | |||||
| fmul v12.2d, v28.2d, alphaV0 | |||||
| fmul v13.2d, v29.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| fmul v8.2d, v24.2d, alphaV2 | |||||
| fmul v8.2d, v24.2d, alphaV0 | |||||
| st1 {v8.2d}, [pCRow2] | st1 {v8.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v28.2d, alphaV3 | |||||
| fmul v12.2d, v28.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.d}[0], [pCRow2] | st1 {v12.d}[0], [pCRow2] | ||||
| st1 {v12.d}[1], [pCRow1] | st1 {v12.d}[1], [pCRow1] | ||||
| @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE8x2 | .macro SAVE8x2 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmul v4.2d, v20.2d, alphaV0 | fmul v4.2d, v20.2d, alphaV0 | ||||
| fmul v5.2d, v21.2d, alphaV1 | |||||
| fmul v6.2d, v22.2d, alphaV2 | |||||
| fmul v7.2d, v23.2d, alphaV3 | |||||
| fmul v5.2d, v21.2d, alphaV0 | |||||
| fmul v6.2d, v22.2d, alphaV0 | |||||
| fmul v7.2d, v23.2d, alphaV0 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV2 | |||||
| fmul v13.2d, v21.2d, alphaV3 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| fmul v13.2d, v21.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1 , pCRow0, LDC | add pCRow1 , pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1 , pCRow0, LDC | add pCRow1 , pCRow0, LDC | ||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE8x1 | .macro SAVE8x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0, alpha | |||||
| fmul d8, d16, alpha0 | fmul d8, d16, alpha0 | ||||
| str d8, [pCRow0] | str d8, [pCRow0] | ||||
| @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha0, d0 | |||||
| fmov alpha1, d0 | |||||
| fmov alpha2, d0 | |||||
| fmov alpha3, d0 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alpha, d0 | |||||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | lsl LDC, LDC, #3 // ldc = ldc * 8 | ||||
| @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| dtrmm_kernel_L4_BEGIN: | dtrmm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| mov tempOffset, offset | mov tempOffset, offset | ||||
| @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dtrmm_kernel_L4_M4_BEGIN | ble dtrmm_kernel_L4_M4_BEGIN | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_20: | dtrmm_kernel_L4_M8_20: | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: | |||||
| add tempK, tempOffset, #4 | add tempK, tempOffset, #4 | ||||
| #endif | #endif | ||||
| asr counterL , tempK, #1 // L = K / 2 | |||||
| asr counterL , tempK, #3 // L = K / 8 | |||||
| cmp counterL , #2 // is there at least 4 to do? | cmp counterL , #2 // is there at least 4 to do? | ||||
| blt dtrmm_kernel_L4_M8_32 | blt dtrmm_kernel_L4_M8_32 | ||||
| KERNEL8x4_I // do one in the K | KERNEL8x4_I // do one in the K | ||||
| KERNEL8x4_M2 // do another in the K | KERNEL8x4_M2 // do another in the K | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| subs counterL, counterL, #2 // subtract 2 | subs counterL, counterL, #2 // subtract 2 | ||||
| ble dtrmm_kernel_L4_M8_22a | ble dtrmm_kernel_L4_M8_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_22: | dtrmm_kernel_L4_M8_22: | ||||
| KERNEL8x4_M1 | KERNEL8x4_M1 | ||||
| KERNEL8x4_M2 | KERNEL8x4_M2 | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dtrmm_kernel_L4_M8_22 | bgt dtrmm_kernel_L4_M8_22 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_22a: | dtrmm_kernel_L4_M8_22a: | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | KERNEL8x4_M1 | ||||
| KERNEL8x4_E | KERNEL8x4_E | ||||
| b dtrmm_kernel_L4_M8_44 | b dtrmm_kernel_L4_M8_44 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_32: | dtrmm_kernel_L4_M8_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble dtrmm_kernel_L4_M8_40 | ble dtrmm_kernel_L4_M8_40 | ||||
| KERNEL8x4_I | KERNEL8x4_I | ||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_E | KERNEL8x4_E | ||||
| b dtrmm_kernel_L4_M8_44 | b dtrmm_kernel_L4_M8_44 | ||||
| @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: | |||||
| dtrmm_kernel_L4_M8_44: | dtrmm_kernel_L4_M8_44: | ||||
| ands counterL , tempK, #1 | |||||
| ands counterL , tempK, #7 | |||||
| ble dtrmm_kernel_L4_M8_100 | ble dtrmm_kernel_L4_M8_100 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_46: | dtrmm_kernel_L4_M8_46: | ||||
| KERNEL8x4_SUB | KERNEL8x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne dtrmm_kernel_L4_M8_46 | |||||
| dtrmm_kernel_L4_M8_100: | dtrmm_kernel_L4_M8_100: | ||||
| SAVE8x4 | SAVE8x4 | ||||
| @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| add tempOffset, tempOffset, #8 | add tempOffset, tempOffset, #8 | ||||
| #endif | #endif | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| dtrmm_kernel_L4_M8_END: | dtrmm_kernel_L4_M8_END: | ||||
| subs counterI, counterI, #1 | subs counterI, counterI, #1 | ||||
| @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SHZ 3 | #define SHZ 3 | ||||
| #endif | #endif | ||||
| #define A_PRE_SIZE 768 | |||||
| #define Y_PRE_SIZE 768 | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| .macro SAVE_REGS | .macro SAVE_REGS | ||||
| @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1 {v2.4s, v3.4s}, [A_PTR], #32 | ld1 {v2.4s, v3.4s}, [A_PTR], #32 | ||||
| ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 | ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 | ||||
| fmla v4.4s, v1.4s, v2.4s | fmla v4.4s, v1.4s, v2.4s | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||||
| fmla v5.4s, v1.4s, v3.4s | fmla v5.4s, v1.4s, v3.4s | ||||
| st1 {v4.4s, v5.4s}, [Y_OPTR], #32 | st1 {v4.4s, v5.4s}, [Y_OPTR], #32 | ||||
| ld1 {v6.4s, v7.4s}, [A_PTR], #32 | ld1 {v6.4s, v7.4s}, [A_PTR], #32 | ||||
| ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 | ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 | ||||
| fmla v8.4s, v1.4s, v6.4s | fmla v8.4s, v1.4s, v6.4s | ||||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||||
| fmla v9.4s, v1.4s, v7.4s | fmla v9.4s, v1.4s, v7.4s | ||||
| st1 {v8.4s, v9.4s}, [Y_OPTR], #32 | st1 {v8.4s, v9.4s}, [Y_OPTR], #32 | ||||
| #else //DOUBLE | #else //DOUBLE | ||||
| ld1 {v2.2d, v3.2d}, [A_PTR], #32 | ld1 {v2.2d, v3.2d}, [A_PTR], #32 | ||||
| ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 | ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 | ||||
| fmla v4.2d, v1.2d, v2.2d | fmla v4.2d, v1.2d, v2.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||||
| fmla v5.2d, v1.2d, v3.2d | fmla v5.2d, v1.2d, v3.2d | ||||
| st1 {v4.2d, v5.2d}, [Y_OPTR], #32 | st1 {v4.2d, v5.2d}, [Y_OPTR], #32 | ||||
| ld1 {v6.2d, v7.2d}, [A_PTR], #32 | ld1 {v6.2d, v7.2d}, [A_PTR], #32 | ||||
| ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 | ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 | ||||
| fmla v8.2d, v1.2d, v6.2d | fmla v8.2d, v1.2d, v6.2d | ||||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||||
| fmla v9.2d, v1.2d, v7.2d | fmla v9.2d, v1.2d, v7.2d | ||||
| st1 {v8.2d, v9.2d}, [Y_OPTR], #32 | st1 {v8.2d, v9.2d}, [Y_OPTR], #32 | ||||
| ld1 {v10.2d, v11.2d}, [A_PTR], #32 | ld1 {v10.2d, v11.2d}, [A_PTR], #32 | ||||
| ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 | ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 | ||||
| fmla v12.2d, v1.2d, v10.2d | fmla v12.2d, v1.2d, v10.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||||
| fmla v13.2d, v1.2d, v11.2d | fmla v13.2d, v1.2d, v11.2d | ||||
| st1 {v12.2d, v13.2d}, [Y_OPTR], #32 | st1 {v12.2d, v13.2d}, [Y_OPTR], #32 | ||||
| ld1 {v14.2d, v15.2d}, [A_PTR], #32 | ld1 {v14.2d, v15.2d}, [A_PTR], #32 | ||||
| ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 | ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 | ||||
| fmla v16.2d, v1.2d, v14.2d | fmla v16.2d, v1.2d, v14.2d | ||||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||||
| fmla v17.2d, v1.2d, v15.2d | fmla v17.2d, v1.2d, v15.2d | ||||
| st1 {v16.2d, v17.2d}, [Y_OPTR], #32 | st1 {v16.2d, v17.2d}, [Y_OPTR], #32 | ||||
| #endif | #endif | ||||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define J x11 /* loop variable */ | #define J x11 /* loop variable */ | ||||
| #define I x12 /* loop variable */ | #define I x12 /* loop variable */ | ||||
| #define X_PREFETCH_SIZE 768 | |||||
| #define A_PREFETCH_SIZE 768 | |||||
| /******************************************************************************* | /******************************************************************************* | ||||
| * Macro definitions | * Macro definitions | ||||
| *******************************************************************************/ | *******************************************************************************/ | ||||
| @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 | ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 | ||||
| ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 | ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 | ||||
| fmla v1.4s, v5.4s, v9.4s | fmla v1.4s, v5.4s, v9.4s | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.4s, v6.4s, v10.4s | fmla v2.4s, v6.4s, v10.4s | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.4s, v7.4s, v11.4s | fmla v3.4s, v7.4s, v11.4s | ||||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||||
| fmla v4.4s, v8.4s, v12.4s | fmla v4.4s, v8.4s, v12.4s | ||||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||||
| ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 | ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 | ||||
| fmla v1.4s, v13.4s, v17.4s | fmla v1.4s, v13.4s, v17.4s | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.4s, v14.4s, v18.4s | fmla v2.4s, v14.4s, v18.4s | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.4s, v15.4s, v19.4s | fmla v3.4s, v15.4s, v19.4s | ||||
| fmla v4.4s, v16.4s, v20.4s | fmla v4.4s, v16.4s, v20.4s | ||||
| #else | #else | ||||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | ||||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | ||||
| fmla v1.2d, v5.2d, v9.2d | fmla v1.2d, v5.2d, v9.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.2d, v6.2d, v10.2d | fmla v2.2d, v6.2d, v10.2d | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.2d, v7.2d, v11.2d | fmla v3.2d, v7.2d, v11.2d | ||||
| fmla v4.2d, v8.2d, v12.2d | fmla v4.2d, v8.2d, v12.2d | ||||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | ||||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | ||||
| fmla v1.2d, v13.2d, v17.2d | fmla v1.2d, v13.2d, v17.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.2d, v14.2d, v18.2d | fmla v2.2d, v14.2d, v18.2d | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.2d, v15.2d, v19.2d | fmla v3.2d, v15.2d, v19.2d | ||||
| fmla v4.2d, v16.2d, v20.2d | fmla v4.2d, v16.2d, v20.2d | ||||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | ||||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | ||||
| fmla v1.2d, v5.2d, v9.2d | fmla v1.2d, v5.2d, v9.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.2d, v6.2d, v10.2d | fmla v2.2d, v6.2d, v10.2d | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.2d, v7.2d, v11.2d | fmla v3.2d, v7.2d, v11.2d | ||||
| fmla v4.2d, v8.2d, v12.2d | fmla v4.2d, v8.2d, v12.2d | ||||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | ||||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | ||||
| fmla v1.2d, v13.2d, v17.2d | fmla v1.2d, v13.2d, v17.2d | ||||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||||
| fmla v2.2d, v14.2d, v18.2d | fmla v2.2d, v14.2d, v18.2d | ||||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||||
| fmla v3.2d, v15.2d, v19.2d | fmla v3.2d, v15.2d, v19.2d | ||||
| fmla v4.2d, v16.2d, v20.2d | fmla v4.2d, v16.2d, v20.2d | ||||
| #endif | #endif | ||||
| @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fabs MAXF, MAXF | fabs MAXF, MAXF | ||||
| .endm | .endm | ||||
| .macro KERNEL_F8 | |||||
| #if !defined(DOUBLE) | |||||
| ldp q2, q3, [X], #32 | |||||
| fabs v2.4s, v2.4s | |||||
| fabs v3.4s, v3.4s | |||||
| fmax v2.4s, v2.4s, v3.4s | |||||
| fmaxv TMPF, v2.4s | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| add Z, Z, #8 | |||||
| #else | |||||
| ldp q2, q3, [X], #32 | |||||
| ldp q4, q5, [X], #32 | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fabs v5.2d, v5.2d | |||||
| fmax v2.2d, v2.2d, v3.2d | |||||
| fmax v4.2d, v4.2d, v5.2d | |||||
| fmax v2.2d, v2.2d, v4.2d | |||||
| fmaxp TMPF, v2.2d | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| add Z, Z, #8 | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE | |||||
| sub x6, INDEX, #1 | |||||
| #if !defined(DOUBLE) | |||||
| lsl x6, x6, #2 | |||||
| add x7, x7, x6 | |||||
| ldp q2, q3, [x7] | |||||
| fabs v2.4s, v2.4s | |||||
| fabs v3.4s, v3.4s | |||||
| ins v4.s[0], v3.s[0] | |||||
| ins v5.s[0], v3.s[1] | |||||
| ins v6.s[0], v3.s[2] | |||||
| ins v7.s[0], v3.s[3] | |||||
| add x6, INDEX, #7 | |||||
| fcmp MAXF, s7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s6 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s5 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v2.s[0] | |||||
| ins v5.s[0], v2.s[1] | |||||
| ins v6.s[0], v2.s[2] | |||||
| ins v7.s[0], v2.s[3] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s6 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s5 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| #else | |||||
| add x6, x6, #4 | |||||
| lsl x6, x6, #3 | |||||
| add x7, x7, x6 | |||||
| ldp q2, q3, [x7] | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| ins v4.d[0], v2.d[0] | |||||
| ins v5.d[0], v2.d[1] | |||||
| ins v6.d[0], v3.d[0] | |||||
| ins v7.d[0], v3.d[1] | |||||
| add x6, INDEX, #7 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d6 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d5 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x7, x7, #32 | |||||
| ldp q2, q3, [x7] | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| ins v4.d[0], v2.d[0] | |||||
| ins v5.d[0], v2.d[1] | |||||
| ins v6.d[0], v3.d[0] | |||||
| ins v7.d[0], v3.d[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d6 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d5 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 | .macro KERNEL_S1 | ||||
| ld1 TMPVF, [X], INC_X | ld1 TMPVF, [X], INC_X | ||||
| add Z, Z, #1 | add Z, Z, #1 | ||||
| @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmp INC_X, xzr | cmp INC_X, xzr | ||||
| ble iamax_kernel_zero | ble iamax_kernel_zero | ||||
| cmp INC_X, #1 | |||||
| bne iamax_kernel_S_BEGIN | |||||
| mov x7, X | |||||
| iamax_kernel_F_BEGIN: | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| asr I, N, #3 | |||||
| cmp I, xzr | |||||
| beq iamax_kernel_F1 | |||||
| add Z, Z, #1 | |||||
| iamax_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F8 | |||||
| KERNEL_F8_FINALIZE | |||||
| sub Z, Z, #1 | |||||
| iamax_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_F10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F10 | |||||
| b iamax_kernel_L999 | |||||
| iamax_kernel_S_BEGIN: | |||||
| INIT_S | INIT_S | ||||
| subs N, N, #1 | subs N, N, #1 | ||||
| @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| .macro KERNEL_F8 | |||||
| #if !defined(DOUBLE) | |||||
| ldp q2, q3, [X], #32 | |||||
| ldp q4, q5, [X], #32 | |||||
| fabs v2.4s, v2.4s | |||||
| fabs v3.4s, v3.4s | |||||
| fabs v4.4s, v4.4s | |||||
| fabs v5.4s, v5.4s | |||||
| faddp v2.4s, v2.4s, v3.4s | |||||
| faddp v3.4s, v4.4s, v5.4s | |||||
| fmax v2.4s, v2.4s, v3.4s | |||||
| fmaxv TMPF, v2.4s | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| add Z, Z, #8 | |||||
| #else | |||||
| ldp q2, q3, [X], #32 | |||||
| ldp q4, q5, [X], #32 | |||||
| ldp q16, q17, [X], #32 | |||||
| ldp q18, q19, [X], #32 | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fabs v5.2d, v5.2d | |||||
| fabs v16.2d, v16.2d | |||||
| fabs v17.2d, v17.2d | |||||
| fabs v18.2d, v18.2d | |||||
| fabs v19.2d, v19.2d | |||||
| faddp v2.2d, v2.2d, v3.2d | |||||
| faddp v3.2d, v4.2d, v5.2d | |||||
| faddp v4.2d, v16.2d, v17.2d | |||||
| faddp v5.2d, v18.2d, v19.2d | |||||
| fmax v2.2d, v2.2d, v3.2d | |||||
| fmax v4.2d, v4.2d, v5.2d | |||||
| fmax v2.2d, v2.2d, v4.2d | |||||
| fmaxp TMPF, v2.2d | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| add Z, Z, #8 | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE | |||||
| sub x6, INDEX, #1 | |||||
| #if !defined(DOUBLE) | |||||
| lsl x6, x6, #3 | |||||
| add x7, x7, x6 | |||||
| ldp q2, q3, [x7] | |||||
| ldp q4, q5, [x7, #32] | |||||
| fabs v2.4s, v2.4s | |||||
| fabs v3.4s, v3.4s | |||||
| fabs v4.4s, v4.4s | |||||
| fabs v5.4s, v5.4s | |||||
| faddp v2.4s, v2.4s, v3.4s | |||||
| faddp v3.4s, v4.4s, v5.4s | |||||
| ins v4.s[0], v3.s[3] | |||||
| add x6, INDEX, #7 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v3.s[2] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v3.s[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v3.s[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v2.s[3] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v2.s[2] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v2.s[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v4.s[0], v2.s[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, s4 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| #else | |||||
| lsl x6, x6, #4 | |||||
| add x7, x7, x6 | |||||
| ldp q2, q3, [x7] | |||||
| ldp q4, q5, [x7, #32] | |||||
| ldp q16, q17, [x7, #64] | |||||
| ldp q18, q19, [x7, #96] | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fabs v5.2d, v5.2d | |||||
| fabs v16.2d, v16.2d | |||||
| fabs v17.2d, v17.2d | |||||
| fabs v18.2d, v18.2d | |||||
| fabs v19.2d, v19.2d | |||||
| faddp v2.2d, v2.2d, v3.2d | |||||
| faddp v3.2d, v4.2d, v5.2d | |||||
| faddp v4.2d, v16.2d, v17.2d | |||||
| faddp v5.2d, v18.2d, v19.2d | |||||
| ins v7.d[0], v5.d[1] | |||||
| add x6, INDEX, #7 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v5.d[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v4.d[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v4.d[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v3.d[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v3.d[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v2.d[1] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| ins v7.d[0], v2.d[0] | |||||
| sub x6, x6, #1 | |||||
| fcmp MAXF, d7 | |||||
| csel INDEX, x6, INDEX, eq | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 | .macro KERNEL_S1 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v1.2s}, [X], INC_X | ld1 {v1.2s}, [X], INC_X | ||||
| @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmp INC_X, xzr | cmp INC_X, xzr | ||||
| ble iamax_kernel_zero | ble iamax_kernel_zero | ||||
| cmp INC_X, #1 | |||||
| bne iamax_kernel_S_BEGIN | |||||
| mov x7, X | |||||
| iamax_kernel_F_BEGIN: | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| asr I, N, #3 | |||||
| cmp I, xzr | |||||
| ble iamax_kernel_F1 | |||||
| add Z, Z, #1 | |||||
| iamax_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F8 | |||||
| KERNEL_F8_FINALIZE | |||||
| sub Z, Z, #1 | |||||
| iamax_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_F10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F10 | |||||
| b iamax_kernel_L999 | |||||
| iamax_kernel_S_BEGIN: | |||||
| INIT_S | INIT_S | ||||
| subs N, N, #1 | subs N, N, #1 | ||||
| @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define alpha_save_R x16 | |||||
| #define alpha_save_I x17 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alphaR x17 | |||||
| #define alphaI x18 | |||||
| #define alpha0_R d10 | #define alpha0_R d10 | ||||
| #define alphaV0_R v10.d[0] | #define alphaV0_R v10.d[0] | ||||
| #define alpha0_I d11 | #define alpha0_I d11 | ||||
| #define alphaV0_I v11.d[0] | #define alphaV0_I v11.d[0] | ||||
| #define alpha1_R d14 | |||||
| #define alphaV1_R v14.d[0] | |||||
| #define alpha1_I d15 | |||||
| #define alphaV1_I v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr fmla | #define OP_rr fmla | ||||
| @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 12 pCRow0 | // 12 pCRow0 | ||||
| // 13 pCRow1 | // 13 pCRow1 | ||||
| // 14 pCRow2 | // 14 pCRow2 | ||||
| // 15 pA | |||||
| // 16 alpha_save_R | |||||
| // 17 alpha_save_I | |||||
| // 18 must save | |||||
| // 15 pCRow3 | |||||
| // 16 pA | |||||
| // 17 alpha_save_R | |||||
| // 18 must save alpha_save_I | |||||
| // 19 must save | // 19 must save | ||||
| // 20 must save | // 20 must save | ||||
| // 21 must save | // 21 must save | ||||
| @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_I | .macro KERNEL4x4_I | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v20.2d, v0.2d, v8.d[1] | fmul v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v22.2d, v2.2d, v8.d[1] | fmul v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | fmul v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v26.2d, v2.2d, v10.d[0] | fmul v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v28.2d, v0.2d, v10.d[1] | fmul v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.d[1] | OP_ir v29.2d, v1.2d, v10.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v30.2d, v2.2d, v10.d[1] | fmul v30.2d, v2.2d, v10.d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.d[1] | OP_ii v30.2d, v3.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.d[1] | OP_ir v31.2d, v3.2d, v10.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_M1 | .macro KERNEL4x4_M1 | ||||
| @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | OP_rr v18.2d, v2.2d, v8.d[0] | ||||
| @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | OP_ri v19.2d, v2.2d, v9.d[0] | ||||
| OP_ir v19.2d, v3.2d, v8.d[0] | OP_ir v19.2d, v3.2d, v8.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v26.2d, v2.2d, v10.d[0] | OP_rr v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| OP_ri v27.2d, v2.2d, v11.d[0] | OP_ri v27.2d, v2.2d, v11.d[0] | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v0.2d, v10.d[1] | OP_rr v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v4.2d, v13.d[0] | OP_ri v17.2d, v4.2d, v13.d[0] | ||||
| OP_ir v17.2d, v5.2d, v12.d[0] | OP_ir v17.2d, v5.2d, v12.d[0] | ||||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||||
| ld2 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v6.2d, v12.d[0] | OP_rr v18.2d, v6.2d, v12.d[0] | ||||
| @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v6.2d, v13.d[0] | OP_ri v19.2d, v6.2d, v13.d[0] | ||||
| OP_ir v19.2d, v7.2d, v12.d[0] | OP_ir v19.2d, v7.2d, v12.d[0] | ||||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v4.2d, v12.d[1] | OP_rr v20.2d, v4.2d, v12.d[1] | ||||
| OP_ii v20.2d, v5.2d, v13.d[1] | OP_ii v20.2d, v5.2d, v13.d[1] | ||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| OP_ir v23.2d, v7.2d, v12.d[1] | OP_ir v23.2d, v7.2d, v12.d[1] | ||||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v4.2d, v14.d[0] | OP_rr v24.2d, v4.2d, v14.d[0] | ||||
| OP_ii v24.2d, v5.2d, v15.d[0] | OP_ii v24.2d, v5.2d, v15.d[0] | ||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| OP_ir v27.2d, v7.2d, v14.d[0] | OP_ir v27.2d, v7.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v4.2d, v14.d[1] | OP_rr v28.2d, v4.2d, v14.d[1] | ||||
| OP_ii v28.2d, v5.2d, v15.d[1] | OP_ii v28.2d, v5.2d, v15.d[1] | ||||
| @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| OP_ii v22.2d, v7.2d, v13.d[1] | OP_ii v22.2d, v7.2d, v13.d[1] | ||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_SUB | .macro KERNEL4x4_SUB | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v16.2d, v0.2d, v8.d[0] | OP_rr v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow0] | |||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow0] | |||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||||
| add pCRow1, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow1] | |||||
| fmla v6.2d, v22.2d, alphaV0_R | fmla v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmla v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, LDC | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow2] | |||||
| fmla v0.2d, v24.2d, alphaV0_R | fmla v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmla v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmla v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v26.2d, alphaV0_R | fmla v2.2d, v26.2d, alphaV0_R | ||||
| fmls v2.2d, v27.2d, alphaV0_I | fmls v2.2d, v27.2d, alphaV0_I | ||||
| fmla v3.2d, v26.2d, alphaV1_I | |||||
| fmla v3.2d, v27.2d, alphaV1_R | |||||
| fmla v3.2d, v26.2d, alphaV0_I | |||||
| fmla v3.2d, v27.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||||
| ld2 {v4.2d, v5.2d}, [pCRow3] | |||||
| fmla v4.2d, v28.2d, alphaV0_R | fmla v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmla v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow3] | |||||
| fmla v6.2d, v30.2d, alphaV0_R | fmla v6.2d, v30.2d, alphaV0_R | ||||
| fmls v6.2d, v31.2d, alphaV0_I | fmls v6.2d, v31.2d, alphaV0_I | ||||
| fmla v7.2d, v30.2d, alphaV1_I | |||||
| fmla v7.2d, v31.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v7.2d, v30.2d, alphaV0_I | |||||
| fmla v7.2d, v31.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v24.2d, alphaV0_R | fmla v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmla v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| fmla v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v28.2d, alphaV0_R | fmla v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmla v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| fmla v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d20, alphaV0_R | fmla d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmla d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmla d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d24, alphaV0_R | fmla d0, d24, alphaV0_R | ||||
| fmls d0, d25, alphaV0_I | fmls d0, d25, alphaV0_I | ||||
| fmla d1, d24, alphaV1_I | |||||
| fmla d1, d25, alphaV1_R | |||||
| fmla d1, d24, alphaV0_I | |||||
| fmla d1, d25, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d28, alphaV0_R | fmla d4, d28, alphaV0_R | ||||
| fmls d4, d29, alphaV0_I | fmls d4, d29, alphaV0_I | ||||
| fmla d5, d28, alphaV1_I | |||||
| fmla d5, d29, alphaV1_R | |||||
| fmla d5, d28, alphaV0_I | |||||
| fmla d5, d29, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | ld2 {v6.2d, v7.2d}, [pCRow2] | ||||
| fmla v6.2d, v22.2d, alphaV0_R | fmla v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmla v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| fmla v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | st2 {v6.2d, v7.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d20, alphaV0_R | fmla d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmla d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmla d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha_save_R, d0 | |||||
| fmov alpha_save_I, d1 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alphaR, d0 | |||||
| fmov alphaI, d1 | |||||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | ||||
| @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ble zgemm_kernel_L2_BEGIN | ble zgemm_kernel_L2_BEGIN | ||||
| zgemm_kernel_L4_BEGIN: | zgemm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| mov pA, origPA // pA = start of A array | mov pA, origPA // pA = start of A array | ||||
| zgemm_kernel_L4_M4_BEGIN: | zgemm_kernel_L4_M4_BEGIN: | ||||
| @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble zgemm_kernel_L4_M2_BEGIN | ble zgemm_kernel_L4_M2_BEGIN | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_20: | zgemm_kernel_L4_M4_20: | ||||
| mov pB, origPB | mov pB, origPB | ||||
| asr counterL , origK, #1 // L = K / 2 | |||||
| cmp counterL , #2 // is there at least 4 to do? | |||||
| asr counterL , origK, #3 | |||||
| cmp counterL , #2 | |||||
| blt zgemm_kernel_L4_M4_32 | blt zgemm_kernel_L4_M4_32 | ||||
| KERNEL4x4_I // do one in the K | |||||
| KERNEL4x4_M2 // do another in the K | |||||
| KERNEL4x4_I | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #2 // subtract 2 | subs counterL, counterL, #2 // subtract 2 | ||||
| ble zgemm_kernel_L4_M4_22a | ble zgemm_kernel_L4_M4_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_22: | zgemm_kernel_L4_M4_22: | ||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_M2 | KERNEL4x4_M2 | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt zgemm_kernel_L4_M4_22 | bgt zgemm_kernel_L4_M4_22 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_22a: | zgemm_kernel_L4_M4_22a: | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b zgemm_kernel_L4_M4_44 | b zgemm_kernel_L4_M4_44 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_32: | zgemm_kernel_L4_M4_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble zgemm_kernel_L4_M4_40 | ble zgemm_kernel_L4_M4_40 | ||||
| KERNEL4x4_I | KERNEL4x4_I | ||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b zgemm_kernel_L4_M4_44 | b zgemm_kernel_L4_M4_44 | ||||
| @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: | |||||
| zgemm_kernel_L4_M4_44: | zgemm_kernel_L4_M4_44: | ||||
| ands counterL , origK, #1 | |||||
| ands counterL , origK, #7 | |||||
| ble zgemm_kernel_L4_M4_100 | ble zgemm_kernel_L4_M4_100 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_46: | zgemm_kernel_L4_M4_46: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne zgemm_kernel_L4_M4_46 | |||||
| zgemm_kernel_L4_M4_100: | zgemm_kernel_L4_M4_100: | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| SAVE4x4 | SAVE4x4 | ||||
| @@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define Y_OPTR x13 /* loop Y vector address */ | #define Y_OPTR x13 /* loop Y vector address */ | ||||
| #define X_PTR x14 /* loop X vector address */ | #define X_PTR x14 /* loop X vector address */ | ||||
| #define A_PRE_SIZE 768 | |||||
| #define Y_PRE_SIZE 768 | |||||
| /******************************************************************************* | /******************************************************************************* | ||||
| * Macro definitions | * Macro definitions | ||||
| *******************************************************************************/ | *******************************************************************************/ | ||||
| @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define ALPHA_R s0 | #define ALPHA_R s0 | ||||
| #define ALPHA_I s1 | #define ALPHA_I s1 | ||||
| #define ALPHA_R_COPY s7 | |||||
| #define ALPHA_I_COPY s8 | |||||
| #define SHZ 3 | #define SHZ 3 | ||||
| #else | #else | ||||
| #define ALPHA_R d0 | #define ALPHA_R d0 | ||||
| #define ALPHA_I d1 | #define ALPHA_I d1 | ||||
| #define ALPHA_R_COPY d7 | |||||
| #define ALPHA_I_COPY d8 | |||||
| #define SHZ 4 | #define SHZ 4 | ||||
| #endif | #endif | ||||
| @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro INIT | .macro INIT | ||||
| /********** INIT FOR F4 LOOP **********/ | |||||
| fmov ALPHA_R_COPY, ALPHA_R | |||||
| fmov ALPHA_I_COPY, ALPHA_I | |||||
| #if !defined(DOUBLE) | |||||
| ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) | |||||
| ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) | |||||
| ins v7.d[1], v7.d[0] | |||||
| ins v8.d[1], v8.d[0] | |||||
| #else | |||||
| ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) | |||||
| ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) | |||||
| #endif | |||||
| /******* INIT FOR F1 AND S1 LOOP ******/ | |||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | ||||
| eor v2.16b, v2.16b, v2.16b | eor v2.16b, v2.16b, v2.16b | ||||
| @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro INIT_LOOP | .macro INIT_LOOP | ||||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] | |||||
| ins v10.s[0], v9.s[1] | |||||
| ins v9.s[1], v9.s[0] // [R(X), R(X)] | |||||
| ins v10.s[1], v10.s[0] // [I(X), I(X)] | |||||
| ins v9.d[1], v9.d[0] | |||||
| ins v10.d[1], v10.d[0] | |||||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||||
| fmul v2.2s, v0.2s, v2.2s | |||||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||||
| ins v3.s[0], v2.s[1] | |||||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||||
| fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||||
| dup v21.4s, v2.s[0] // R[TEMP] | |||||
| dup v22.4s, v2.s[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub s25, s25, s3 | |||||
| dup v23.4s, v25.s[0] // -I[TEMP] | |||||
| dup v24.4s, v3.s[0] // I[TEMP] | |||||
| #else | #else | ||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||||
| fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||||
| dup v21.4s, v2.s[0] // R[TEMP] | |||||
| dup v22.4s, v2.s[0] // R[TEMP] | |||||
| dup v23.4s, v3.s[0] // I[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub s25, s25, s3 | |||||
| dup v24.4s, v25.s[0] // -I[TEMP] | |||||
| #endif | #endif | ||||
| #else // CONJ | #else // CONJ | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||||
| dup v21.4s, v2.s[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub s25, s25, s2 | |||||
| dup v22.4s, v25.s[0] // -R[TEMP] | |||||
| dup v23.4s, v3.s[0] // I[TEMP] | |||||
| dup v24.4s, v3.s[0] // I[TEMP] | |||||
| #else | #else | ||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||||
| eor v12.16b, v12.16b, v12.16b | |||||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||||
| fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||||
| dup v21.4s, v2.s[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub s25, s25, s2 | |||||
| dup v22.4s, v25.s[0] // -R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub s25, s25, s3 | |||||
| dup v23.4s, v25.s[0] // -I[TEMP] | |||||
| dup v24.4s, v25.s[0] // -I[TEMP] | |||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | ||||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||||
| fmul v2.2s, v0.2s, v2.2s | |||||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||||
| ins v3.s[0], v2.s[1] | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| eor v4.16b, v4.16b, v4.16b | eor v4.16b, v4.16b, v4.16b | ||||
| @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif // CONJ | #endif // CONJ | ||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||||
| fmul v2.2d, v0.2d, v2.2d | |||||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||||
| ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] | |||||
| ins v10.d[0], v9.d[1] | |||||
| ins v9.d[1], v9.d[0] // [R(X), R(X)] | |||||
| ins v10.d[1], v10.d[0] // [I(X), I(X)] | |||||
| /****** INIT_LOOP FOR F4 LOOP ******/ | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||||
| fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||||
| dup v21.2d, v2.d[0] // R[TEMP] | |||||
| dup v22.2d, v2.d[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub d25, d25, d3 | |||||
| dup v23.2d, v25.d[0] // -I[TEMP] | |||||
| dup v24.2d, v3.d[0] // I[TEMP] | |||||
| #else | #else | ||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||||
| fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||||
| dup v21.2d, v2.d[0] // R[TEMP] | |||||
| dup v22.2d, v2.d[0] // R[TEMP] | |||||
| dup v23.2d, v3.d[0] // I[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub d25, d25, d3 | |||||
| dup v24.2d, v25.d[0] // -I[TEMP] | |||||
| #endif | #endif | ||||
| #else // CONJ | #else // CONJ | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||||
| dup v21.2d, v2.d[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub d25, d25, d2 | |||||
| dup v22.2d, v25.d[0] // -R[TEMP] | |||||
| dup v23.2d, v3.d[0] // I[TEMP] | |||||
| dup v24.2d, v3.d[0] // I[TEMP] | |||||
| #else | #else | ||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||||
| eor v12.16b, v12.16b, v12.16b | |||||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||||
| fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||||
| dup v21.2d, v2.d[0] // R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub d25, d25, d2 | |||||
| dup v22.2d, v25.d[0] // -R[TEMP] | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fsub d25, d25, d3 | |||||
| dup v23.2d, v25.d[0] // -I[TEMP] | |||||
| dup v24.2d, v25.d[0] // -I[TEMP] | |||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | ||||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||||
| fmul v2.2d, v0.2d, v2.2d | |||||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| eor v4.16b, v4.16b, v4.16b | eor v4.16b, v4.16b, v4.16b | ||||
| @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ||||
| ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 | ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||||
| fmla v15.4s, v21.4s, v13.4s | |||||
| fmla v15.4s, v23.4s, v14.4s | |||||
| fmla v16.4s, v22.4s, v14.4s | |||||
| fmla v16.4s, v24.4s, v13.4s | |||||
| st2 {v15.4s, v16.4s}, [Y_OPTR], #32 | st2 {v15.4s, v16.4s}, [Y_OPTR], #32 | ||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ||||
| ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 | ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||||
| fmla v15.2d, v21.2d, v13.2d | |||||
| fmla v15.2d, v23.2d, v14.2d | |||||
| fmla v16.2d, v22.2d, v14.2d | |||||
| fmla v16.2d, v24.2d, v13.2d | |||||
| st2 {v15.2d, v16.2d}, [Y_OPTR], #32 | st2 {v15.2d, v16.2d}, [Y_OPTR], #32 | ||||
| ld2 {v17.2d, v18.2d}, [A_PTR], #32 | ld2 {v17.2d, v18.2d}, [A_PTR], #32 | ||||
| ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 | ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||||
| fmla v19.2d, v21.2d, v17.2d | |||||
| fmla v19.2d, v23.2d, v18.2d | |||||
| fmla v20.2d, v22.2d, v18.2d | |||||
| fmla v20.2d, v24.2d, v17.2d | |||||
| st2 {v19.2d, v20.2d}, [Y_OPTR], #32 | st2 {v19.2d, v20.2d}, [Y_OPTR], #32 | ||||
| #endif | #endif | ||||
| @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: | |||||
| zgemv_n_kernel_F4: | zgemv_n_kernel_F4: | ||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | subs I, I, #1 | ||||
| bne zgemv_n_kernel_F4 | bne zgemv_n_kernel_F4 | ||||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define J x11 /* loop variable */ | #define J x11 /* loop variable */ | ||||
| #define I x12 /* loop variable */ | #define I x12 /* loop variable */ | ||||
| #define A_PRE_SIZE 768 | |||||
| #define X_PRE_SIZE 768 | |||||
| /******************************************************************************* | /******************************************************************************* | ||||
| * Macro definitions | * Macro definitions | ||||
| *******************************************************************************/ | *******************************************************************************/ | ||||
| @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ||||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ||||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | ||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | ||||
| @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ||||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ||||
| prfm PLDL1STRM, [X_PTR, #512] | |||||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | ||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | ||||
| @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ||||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ||||
| prfm PLDL1STRM, [A_PTR, #512] | |||||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | ||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | ||||
| @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define alpha_save_R x16 | |||||
| #define alpha_save_I x17 | |||||
| #define temp x18 | |||||
| #define tempOffset x19 | |||||
| #define tempK x20 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alphaR x17 | |||||
| #define alphaI x18 | |||||
| #define temp x19 | |||||
| #define tempOffset x20 | |||||
| #define tempK x21 | |||||
| #define alpha0_R d10 | #define alpha0_R d10 | ||||
| #define alphaV0_R v10.d[0] | #define alphaV0_R v10.d[0] | ||||
| #define alpha0_I d11 | #define alpha0_I d11 | ||||
| #define alphaV0_I v11.d[0] | #define alphaV0_I v11.d[0] | ||||
| #define alpha1_R d14 | |||||
| #define alphaV1_R v14.d[0] | |||||
| #define alpha1_I d15 | |||||
| #define alphaV1_I v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr fmla | #define OP_rr fmla | ||||
| @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 04 origPB | // 04 origPB | ||||
| // 05 pC | // 05 pC | ||||
| // 06 origLDC -> LDC | // 06 origLDC -> LDC | ||||
| // 07 offset | |||||
| // 07 offset -> temp | |||||
| // 08 counterL | // 08 counterL | ||||
| // 09 counterI | // 09 counterI | ||||
| // 10 counterJ | // 10 counterJ | ||||
| @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 12 pCRow0 | // 12 pCRow0 | ||||
| // 13 pCRow1 | // 13 pCRow1 | ||||
| // 14 pCRow2 | // 14 pCRow2 | ||||
| // 15 pA | |||||
| // 16 alpha_save_R | |||||
| // 17 alpha_save_I | |||||
| // 18 must save temp | |||||
| // 19 must save tempOffset | |||||
| // 20 must save tempK | |||||
| // 21 must save | |||||
| // 15 pCRow3 | |||||
| // 16 pA | |||||
| // 17 alphaR | |||||
| // 18 must save alphaI | |||||
| // 19 must save temp | |||||
| // 20 must save tempOffset | |||||
| // 21 must save tempK | |||||
| // 22 must save | // 22 must save | ||||
| // 23 must save | // 23 must save | ||||
| // 24 must save | // 24 must save | ||||
| @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_I | .macro KERNEL4x4_I | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v20.2d, v0.2d, v8.d[1] | fmul v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v22.2d, v2.2d, v8.d[1] | fmul v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | fmul v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v26.2d, v2.2d, v10.d[0] | fmul v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v28.2d, v0.2d, v10.d[1] | fmul v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.d[1] | OP_ir v29.2d, v1.2d, v10.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v30.2d, v2.2d, v10.d[1] | fmul v30.2d, v2.2d, v10.d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.d[1] | OP_ii v30.2d, v3.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.d[1] | OP_ir v31.2d, v3.2d, v10.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_M1 | .macro KERNEL4x4_M1 | ||||
| @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | OP_rr v18.2d, v2.2d, v8.d[0] | ||||
| @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | OP_ri v19.2d, v2.2d, v9.d[0] | ||||
| OP_ir v19.2d, v3.2d, v8.d[0] | OP_ir v19.2d, v3.2d, v8.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v26.2d, v2.2d, v10.d[0] | OP_rr v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| OP_ri v27.2d, v2.2d, v11.d[0] | OP_ri v27.2d, v2.2d, v11.d[0] | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v0.2d, v10.d[1] | OP_rr v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v4.2d, v13.d[0] | OP_ri v17.2d, v4.2d, v13.d[0] | ||||
| OP_ir v17.2d, v5.2d, v12.d[0] | OP_ir v17.2d, v5.2d, v12.d[0] | ||||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||||
| ld2 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v6.2d, v12.d[0] | OP_rr v18.2d, v6.2d, v12.d[0] | ||||
| @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v6.2d, v13.d[0] | OP_ri v19.2d, v6.2d, v13.d[0] | ||||
| OP_ir v19.2d, v7.2d, v12.d[0] | OP_ir v19.2d, v7.2d, v12.d[0] | ||||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v4.2d, v12.d[1] | OP_rr v20.2d, v4.2d, v12.d[1] | ||||
| OP_ii v20.2d, v5.2d, v13.d[1] | OP_ii v20.2d, v5.2d, v13.d[1] | ||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| OP_ir v23.2d, v7.2d, v12.d[1] | OP_ir v23.2d, v7.2d, v12.d[1] | ||||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v4.2d, v14.d[0] | OP_rr v24.2d, v4.2d, v14.d[0] | ||||
| OP_ii v24.2d, v5.2d, v15.d[0] | OP_ii v24.2d, v5.2d, v15.d[0] | ||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| OP_ir v27.2d, v7.2d, v14.d[0] | OP_ir v27.2d, v7.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v4.2d, v14.d[1] | OP_rr v28.2d, v4.2d, v14.d[1] | ||||
| OP_ii v28.2d, v5.2d, v15.d[1] | OP_ii v28.2d, v5.2d, v15.d[1] | ||||
| @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| OP_ii v22.2d, v7.2d, v13.d[1] | OP_ii v22.2d, v7.2d, v13.d[1] | ||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_SUB | .macro KERNEL4x4_SUB | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v16.2d, v0.2d, v8.d[0] | OP_rr v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | |||||
| add pCRow1, pCRow1, #32 | |||||
| fmul v6.2d, v22.2d, alphaV0_R | fmul v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmul v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmul v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| fmul v0.2d, v24.2d, alphaV0_R | fmul v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmul v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| fmul v2.2d, v26.2d, alphaV0_R | fmul v2.2d, v26.2d, alphaV0_R | ||||
| fmls v2.2d, v27.2d, alphaV0_I | fmls v2.2d, v27.2d, alphaV0_I | ||||
| fmul v3.2d, v26.2d, alphaV1_I | |||||
| fmla v3.2d, v27.2d, alphaV1_R | |||||
| fmul v3.2d, v26.2d, alphaV0_I | |||||
| fmla v3.2d, v27.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v4.2d, v28.2d, alphaV0_R | fmul v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmul v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| fmul v6.2d, v30.2d, alphaV0_R | fmul v6.2d, v30.2d, alphaV0_R | ||||
| fmls v6.2d, v31.2d, alphaV0_I | fmls v6.2d, v31.2d, alphaV0_I | ||||
| fmul v7.2d, v30.2d, alphaV1_I | |||||
| fmla v7.2d, v31.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmul v7.2d, v30.2d, alphaV0_I | |||||
| fmla v7.2d, v31.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v0.2d, v24.2d, alphaV0_R | fmul v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmul v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| fmul v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v28.2d, alphaV0_R | fmul v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmul v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| fmul v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d20, alphaV0_R | fmul d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmul d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmul d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d0, d24, alphaV0_R | fmul d0, d24, alphaV0_R | ||||
| fmls d0, d25, alphaV0_I | fmls d0, d25, alphaV0_I | ||||
| fmul d1, d24, alphaV1_I | |||||
| fmla d1, d25, alphaV1_R | |||||
| fmul d1, d24, alphaV0_I | |||||
| fmla d1, d25, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d28, alphaV0_R | fmul d4, d28, alphaV0_R | ||||
| fmls d4, d29, alphaV0_I | fmls d4, d29, alphaV0_I | ||||
| fmul d5, d28, alphaV1_I | |||||
| fmla d5, d29, alphaV1_R | |||||
| fmul d5, d28, alphaV0_I | |||||
| fmla d5, d29, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v6.2d, v22.2d, alphaV0_R | fmul v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmul v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| fmul v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | st2 {v6.2d, v7.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d20, alphaV0_R | fmul d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmul d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmul d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha_save_R, d0 | |||||
| fmov alpha_save_I, d1 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alphaR, d0 | |||||
| fmov alphaI, d1 | |||||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | ||||
| @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ble ztrmm_kernel_L2_BEGIN | ble ztrmm_kernel_L2_BEGIN | ||||
| ztrmm_kernel_L4_BEGIN: | ztrmm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| mov tempOffset, offset | mov tempOffset, offset | ||||
| @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble ztrmm_kernel_L4_M2_BEGIN | ble ztrmm_kernel_L4_M2_BEGIN | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_20: | ztrmm_kernel_L4_M4_20: | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: | |||||
| add tempK, tempOffset, #4 | add tempK, tempOffset, #4 | ||||
| #endif | #endif | ||||
| asr counterL , tempK, #1 // L = K / 2 | |||||
| cmp counterL , #2 // is there at least 4 to do? | |||||
| asr counterL , tempK, #3 | |||||
| cmp counterL , #2 | |||||
| blt ztrmm_kernel_L4_M4_32 | blt ztrmm_kernel_L4_M4_32 | ||||
| KERNEL4x4_I // do one in the K | |||||
| KERNEL4x4_M2 // do another in the K | |||||
| KERNEL4x4_I | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #2 | subs counterL, counterL, #2 | ||||
| ble ztrmm_kernel_L4_M4_22a | ble ztrmm_kernel_L4_M4_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_22: | ztrmm_kernel_L4_M4_22: | ||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_M2 | KERNEL4x4_M2 | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt ztrmm_kernel_L4_M4_22 | bgt ztrmm_kernel_L4_M4_22 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_22a: | ztrmm_kernel_L4_M4_22a: | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b ztrmm_kernel_L4_M4_44 | b ztrmm_kernel_L4_M4_44 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_32: | ztrmm_kernel_L4_M4_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble ztrmm_kernel_L4_M4_40 | ble ztrmm_kernel_L4_M4_40 | ||||
| KERNEL4x4_I | KERNEL4x4_I | ||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b ztrmm_kernel_L4_M4_44 | b ztrmm_kernel_L4_M4_44 | ||||
| @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: | |||||
| ztrmm_kernel_L4_M4_44: | ztrmm_kernel_L4_M4_44: | ||||
| ands counterL , tempK, #1 | |||||
| ands counterL , tempK, #7 | |||||
| ble ztrmm_kernel_L4_M4_100 | ble ztrmm_kernel_L4_M4_100 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_46: | ztrmm_kernel_L4_M4_46: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne ztrmm_kernel_L4_M4_46 | |||||
| ztrmm_kernel_L4_M4_100: | ztrmm_kernel_L4_M4_100: | ||||
| SAVE4x4 | SAVE4x4 | ||||
| @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: | |||||
| add tempOffset, tempOffset, #4 | add tempOffset, tempOffset, #4 | ||||
| #endif | #endif | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| ztrmm_kernel_L4_M4_END: | ztrmm_kernel_L4_M4_END: | ||||
| subs counterI, counterI, #1 | subs counterI, counterI, #1 | ||||
| bne ztrmm_kernel_L4_M4_20 | bne ztrmm_kernel_L4_M4_20 | ||||
| @@ -0,0 +1,46 @@ | |||||
| ifndef SNRM2KERNEL | |||||
| SNRM2KERNEL = nrm2.c | |||||
| endif | |||||
| ifndef DNRM2KERNEL | |||||
| DNRM2KERNEL = nrm2.c | |||||
| endif | |||||
| ifndef CNRM2KERNEL | |||||
| CNRM2KERNEL = znrm2.c | |||||
| endif | |||||
| ifndef ZNRM2KERNEL | |||||
| ZNRM2KERNEL = znrm2.c | |||||
| endif | |||||
| ifndef SCABS_KERNEL | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef DCABS_KERNEL | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef QCABS_KERNEL | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef LSAME_KERNEL | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| endif | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| @@ -0,0 +1,221 @@ | |||||
| SAMAXKERNEL = ../mips/amax.c | |||||
| DAMAXKERNEL = ../mips/amax.c | |||||
| CAMAXKERNEL = ../mips/zamax.c | |||||
| ZAMAXKERNEL = ../mips/zamax.c | |||||
| SAMINKERNEL = ../mips/amin.c | |||||
| DAMINKERNEL = ../mips/amin.c | |||||
| CAMINKERNEL = ../mips/zamin.c | |||||
| ZAMINKERNEL = ../mips/zamin.c | |||||
| SMAXKERNEL = ../mips/max.c | |||||
| DMAXKERNEL = ../mips/max.c | |||||
| SMINKERNEL = ../mips/min.c | |||||
| DMINKERNEL = ../mips/min.c | |||||
| ISAMAXKERNEL = ../mips/iamax.c | |||||
| IDAMAXKERNEL = ../mips/iamax.c | |||||
| ICAMAXKERNEL = ../mips/izamax.c | |||||
| IZAMAXKERNEL = ../mips/izamax.c | |||||
| ISAMINKERNEL = ../mips/iamin.c | |||||
| IDAMINKERNEL = ../mips/iamin.c | |||||
| ICAMINKERNEL = ../mips/izamin.c | |||||
| IZAMINKERNEL = ../mips/izamin.c | |||||
| ISMAXKERNEL = ../mips/imax.c | |||||
| IDMAXKERNEL = ../mips/imax.c | |||||
| ISMINKERNEL = ../mips/imin.c | |||||
| IDMINKERNEL = ../mips/imin.c | |||||
| ifdef HAVE_MSA | |||||
| SASUMKERNEL = ../mips/sasum_msa.c | |||||
| DASUMKERNEL = ../mips/dasum_msa.c | |||||
| CASUMKERNEL = ../mips/casum_msa.c | |||||
| ZASUMKERNEL = ../mips/zasum_msa.c | |||||
| else | |||||
| SASUMKERNEL = ../mips/asum.c | |||||
| DASUMKERNEL = ../mips/asum.c | |||||
| CASUMKERNEL = ../mips/asum.c | |||||
| ZASUMKERNEL = ../mips/asum.c | |||||
| endif | |||||
| SAXPYKERNEL = ../mips/axpy.c | |||||
| DAXPYKERNEL = ../mips/axpy.c | |||||
| CAXPYKERNEL = ../mips/zaxpy.c | |||||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||||
| SCOPYKERNEL = ../mips/copy.c | |||||
| DCOPYKERNEL = ../mips/copy.c | |||||
| CCOPYKERNEL = ../mips/zcopy.c | |||||
| ZCOPYKERNEL = ../mips/zcopy.c | |||||
| ifdef HAVE_MSA | |||||
| SDOTKERNEL = ../mips/sdot_msa.c | |||||
| DDOTKERNEL = ../mips/ddot_msa.c | |||||
| CDOTKERNEL = ../mips/cdot_msa.c | |||||
| ZDOTKERNEL = ../mips/zdot_msa.c | |||||
| else | |||||
| SDOTKERNEL = ../mips/dot.c | |||||
| DDOTKERNEL = ../mips/dot.c | |||||
| CDOTKERNEL = ../mips/zdot.c | |||||
| ZDOTKERNEL = ../mips/zdot.c | |||||
| endif | |||||
| SNRM2KERNEL = ../mips/nrm2.c | |||||
| DNRM2KERNEL = ../mips/nrm2.c | |||||
| CNRM2KERNEL = ../mips/znrm2.c | |||||
| ZNRM2KERNEL = ../mips/znrm2.c | |||||
| SROTKERNEL = ../mips/rot.c | |||||
| DROTKERNEL = ../mips/rot.c | |||||
| CROTKERNEL = ../mips/zrot.c | |||||
| ZROTKERNEL = ../mips/zrot.c | |||||
| SSCALKERNEL = ../mips/scal.c | |||||
| DSCALKERNEL = ../mips/scal.c | |||||
| CSCALKERNEL = ../mips/zscal.c | |||||
| ZSCALKERNEL = ../mips/zscal.c | |||||
| SSWAPKERNEL = ../mips/swap.c | |||||
| DSWAPKERNEL = ../mips/swap.c | |||||
| CSWAPKERNEL = ../mips/zswap.c | |||||
| ZSWAPKERNEL = ../mips/zswap.c | |||||
| ifdef HAVE_MSA | |||||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||||
| ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||||
| else | |||||
| SGEMVNKERNEL = ../mips/gemv_n.c | |||||
| DGEMVNKERNEL = ../mips/gemv_n.c | |||||
| CGEMVNKERNEL = ../mips/zgemv_n.c | |||||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | |||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||||
| ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||||
| else | |||||
| SGEMVTKERNEL = ../mips/gemv_t.c | |||||
| DGEMVTKERNEL = ../mips/gemv_t.c | |||||
| CGEMVTKERNEL = ../mips/zgemv_t.c | |||||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| else | |||||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||||
| DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||||
| DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| else | |||||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| endif | |||||
# Single-precision complex GEMM.
# The outer-copy object names are common to both kernel families. The MSA
# selection additionally sets inner-copy sources and objects, which the
# generic selection leaves unset.
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ifdef HAVE_MSA
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
else
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# Double-precision complex GEMM.
# The packed-copy object names are the same for both kernel families, so they
# are set once; only the source files depend on HAVE_MSA.
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ifdef HAVE_MSA
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
else
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# Single-precision real TRSM kernels (LN, LT, RN, RT variants).
# Generic C sources are used unless HAVE_MSA is set, in which case the
# 8x8 MSA implementations are selected.
ifndef HAVE_MSA
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
endif
# Double-precision real TRSM kernels (LN, LT, RN, RT variants).
# Generic C sources are used unless HAVE_MSA is set, in which case the
# 8x4 MSA implementations are selected.
ifndef HAVE_MSA
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
endif
# Single-precision complex TRSM kernels (LN, LT, RN, RT variants).
# No MSA-specific source is selected here: the generic C sources are used
# whether or not HAVE_MSA is set, so the assignments are unconditional.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Double-precision complex TRSM kernels (LN, LT, RN, RT variants).
# No MSA-specific source is selected here: the generic C sources are used
# whether or not HAVE_MSA is set, so the assignments are unconditional.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
| @@ -0,0 +1,2 @@ | |||||
# Double-colon "clean" target; no prerequisites or recipe are given on this line.
| clean :: | |||||
| @@ -0,0 +1,66 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT maxf=0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||||
| maxf=ABS(x[0]); | |||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( ABS(x[ix]) > maxf ) | |||||
| { | |||||
| maxf = ABS(x[ix]); | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(maxf); | |||||
| } | |||||
| @@ -0,0 +1,66 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(minf); | |||||
| minf=ABS(x[0]); | |||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | |||||
| { | |||||
| if( ABS(x[ix]) < minf ) | |||||
| { | |||||
| minf = ABS(x[ix]); | |||||
| } | |||||
| ix += inc_x; | |||||
| i++; | |||||
| } | |||||
| return(minf); | |||||
| } | |||||
| @@ -0,0 +1,57 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #if defined(DOUBLE) | |||||
| #define ABS fabs | |||||
| #else | |||||
| #define ABS fabsf | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT sumf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| n *= inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += ABS(x[i]); | |||||
| i += inc_x; | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,95 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix,iy; | |||||
| if ( n < 0 ) return(0); | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| if ( beta == 0.0 ) | |||||
| { | |||||
| if ( alpha == 0.0 ) | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = 0.0 ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = alpha * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( alpha == 0.0 ) | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = beta * y[iy] ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = alpha * x[ix] + beta * y[iy] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,54 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix,iy; | |||||
| if ( n < 0 ) return(0); | |||||
| if ( da == 0.0 ) return(0); | |||||
| ix = 0; | |||||
| iy = 0; | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] += da * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,338 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #include "macros_msa.h" | |||||
| #define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i, inc_x2; | |||||
| FLOAT sumf = 0.0; | |||||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||||
| v4f32 zero_v = {0}; | |||||
| v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; | |||||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||||
| if (1 == inc_x) | |||||
| { | |||||
| if (n > 15) | |||||
| { | |||||
| n -= 16; | |||||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 = AND_VEC_W(src0); | |||||
| sum_abs1 = AND_VEC_W(src1); | |||||
| sum_abs2 = AND_VEC_W(src2); | |||||
| sum_abs3 = AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| sum_abs3 += AND_VEC_W(src7); | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = zero_v; | |||||
| sum_abs1 = zero_v; | |||||
| sum_abs2 = zero_v; | |||||
| sum_abs3 = zero_v; | |||||
| } | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| sum_abs3 += AND_VEC_W(src7); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4) && (n & 2)) | |||||
| { | |||||
| LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if ((n & 8) && (n & 2)) | |||||
| { | |||||
| LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_SP3_INC(x, 4, src0, src1, src2); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_SP4_INC(x, 4, src0, src1, src2, src3); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_SP2_INC(x, 4, src0, src1); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| src0 = LD_SP(x); x += 4; | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| sumf += fabsf(*(x + 0)); | |||||
| sumf += fabsf(*(x + 1)); | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| sumf += sum_abs0[1]; | |||||
| sumf += sum_abs0[2]; | |||||
| sumf += sum_abs0[3]; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| if (n > 8) | |||||
| { | |||||
| n -= 8; | |||||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 = AND_VEC_W(src0); | |||||
| sum_abs1 = AND_VEC_W(src1); | |||||
| sum_abs2 = AND_VEC_W(src2); | |||||
| sum_abs3 = AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| sum_abs3 += AND_VEC_W(src7); | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = zero_v; | |||||
| sum_abs1 = zero_v; | |||||
| sum_abs2 = zero_v; | |||||
| sum_abs3 = zero_v; | |||||
| } | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| sum_abs3 += AND_VEC_W(src7); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| sum_abs2 += AND_VEC_W(src6); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| sum_abs1 += AND_VEC_W(src5); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| sum_abs0 += AND_VEC_W(src4); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_SP3_INC(x, inc_x2, src0, src1, src2); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| sum_abs2 += AND_VEC_W(src2); | |||||
| sum_abs3 += AND_VEC_W(src3); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_SP2_INC(x, inc_x2, src0, src1); | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| sum_abs1 += AND_VEC_W(src1); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| src0 = LD_SP(x); x += inc_x2; | |||||
| sum_abs0 += AND_VEC_W(src0); | |||||
| } | |||||
| } | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||||
| } | |||||
| return (sumf); | |||||
| } | |||||
| @@ -0,0 +1,361 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/* Operators spliced into the scalar accumulation code.  Each one takes the
   opposite sign in the CONJ build from the one it has in the non-CONJ build. */
#if defined(CONJ)
    #define OP2 -=
    #define OP3 +
    #define OP4 -
#else
    #define OP2 +=
    #define OP3 -
    #define OP4 +
#endif
/* One accumulation step on the vector pair with index IDX:
     dot0 accumulates  vxNr*vyNr  (OPR0)=  vxNi*vyNi
     dot1 accumulates  (OPR1)= vxNi*vyNr   +=  vxNr*vyNi
   OPR0 and OPR1 are bare '+' or '-' tokens that are pasted onto '=' to form
   the compound assignment operator. */
#define DOT_STEP(IDX, OPR0, OPR1)                  \
    dot0 += (vx##IDX##r * vy##IDX##r);             \
    dot0 OPR0## = (vx##IDX##i * vy##IDX##i);       \
    dot1 OPR1## = (vx##IDX##i * vy##IDX##r);       \
    dot1 += (vx##IDX##r * vy##IDX##i);

/* Vector pair 0 only. */
#define DOT4_KERNEL(OPR0, OPR1)  \
    DOT_STEP(0, OPR0, OPR1)

/* Vector pairs 0..1. */
#define DOT8_KERNEL(OPR0, OPR1)  \
    DOT4_KERNEL(OPR0, OPR1)      \
    DOT_STEP(1, OPR0, OPR1)

/* Vector pairs 0..2. */
#define DOT12_KERNEL(OPR0, OPR1)  \
    DOT8_KERNEL(OPR0, OPR1)       \
    DOT_STEP(2, OPR0, OPR1)

/* Vector pairs 0..3. */
#define DOT16_KERNEL(OPR0, OPR1)  \
    DOT12_KERNEL(OPR0, OPR1)      \
    DOT_STEP(3, OPR0, OPR1)
| /* return float, x,y float */ | |||||
| /* cdotc - CONJ */ | |||||
| /* cdotu - !CONJ */ | |||||
| #ifndef _MSC_VER | |||||
| #include <complex.h> | |||||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #else | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #endif | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| FLOAT dot[2]; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||||
| v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||||
| v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||||
| v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||||
| v4f32 dot0 = {0, 0, 0, 0}; | |||||
| v4f32 dot1 = {0, 0, 0, 0}; | |||||
| openblas_complex_float result; | |||||
| dot[0] = 0.0; | |||||
| dot[1] = 0.0; | |||||
| __real__(result) = 0.0; | |||||
| __imag__(result) = 0.0; | |||||
| if ( n < 1 ) return(result); | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||||
| PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); | |||||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||||
| PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); | |||||
| #if !defined(CONJ) | |||||
| DOT16_KERNEL(-, +); | |||||
| #else | |||||
| DOT16_KERNEL(+, -); | |||||
| #endif | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||||
| LD_SP2_INC(x, 4, vx4, vx5); | |||||
| LD_SP2_INC(y, 4, vy4, vy5); | |||||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||||
| #if !defined(CONJ) | |||||
| DOT12_KERNEL(-, +); | |||||
| #else | |||||
| DOT12_KERNEL(+, -); | |||||
| #endif | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||||
| #if !defined(CONJ) | |||||
| DOT8_KERNEL(-, +); | |||||
| #else | |||||
| DOT8_KERNEL(+, -); | |||||
| #endif | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_SP2_INC(x, 4, vx0, vx1); | |||||
| LD_SP2_INC(y, 4, vy0, vy1); | |||||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||||
| #if !defined(CONJ) | |||||
| DOT4_KERNEL(-, +); | |||||
| #else | |||||
| DOT4_KERNEL(+, -); | |||||
| #endif | |||||
| } | |||||
| if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5); | |||||
| LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5); | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_GP4_INC(x, 1, x0, x1, x2, x3); | |||||
| LD_GP4_INC(y, 1, y0, y1, y2, y3); | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, x0, x1); | |||||
| LD_GP2_INC(y, 1, y0, y1); | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| } | |||||
| } | |||||
| dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); | |||||
| dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| x0 = *x; | |||||
| x1 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x2 = *x; | |||||
| x3 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x4 = *x; | |||||
| x5 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x6 = *x; | |||||
| x7 = *(x + 1); | |||||
| x += inc_x2; | |||||
| y0 = *y; | |||||
| y1 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y2 = *y; | |||||
| y3 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y4 = *y; | |||||
| y5 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y6 = *y; | |||||
| y7 = *(y + 1); | |||||
| y += inc_y2; | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||||
| dot[0] += ( x6 * y6 OP3 x7 * y7 ); | |||||
| dot[1] OP2 ( x7 * y6 OP4 x6 * y7 ); | |||||
| } | |||||
| if ((n & 2) && (n & 1)) | |||||
| { | |||||
| x0 = *x; | |||||
| x1 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x2 = *x; | |||||
| x3 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x4 = *x; | |||||
| x5 = *(x + 1); | |||||
| x += inc_x2; | |||||
| y0 = *y; | |||||
| y1 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y2 = *y; | |||||
| y3 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y4 = *y; | |||||
| y5 = *(y + 1); | |||||
| y += inc_y2; | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| x0 = *x; | |||||
| x1 = *(x + 1); | |||||
| x += inc_x2; | |||||
| x2 = *x; | |||||
| x3 = *(x + 1); | |||||
| x += inc_x2; | |||||
| y0 = *y; | |||||
| y1 = *(y + 1); | |||||
| y += inc_y2; | |||||
| y2 = *y; | |||||
| y3 = *(y + 1); | |||||
| y += inc_y2; | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = *x; | |||||
| x1 = *(x + 1); | |||||
| x += inc_x2; | |||||
| y0 = *y; | |||||
| y1 = *(y + 1); | |||||
| y += inc_y2; | |||||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||||
| } | |||||
| } | |||||
| __real__(result) = dot[0]; | |||||
| __imag__(result) = dot[1]; | |||||
| return(result); | |||||
| } | |||||
| @@ -0,0 +1,195 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v4f32 dst0, dst1, dst4, dst5; | |||||
| psrc0 = src; | |||||
| pdst = dst; | |||||
| lda *= 2; | |||||
| for (j = (n >> 2); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src2 = LD_SP(psrc2); | |||||
| src4 = LD_SP(psrc3); | |||||
| src6 = LD_SP(psrc4); | |||||
| psrc1 += 4; | |||||
| psrc2 += 4; | |||||
| psrc3 += 4; | |||||
| psrc4 += 4; | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| ctemp05 = *(psrc3 + 0); | |||||
| ctemp06 = *(psrc3 + 1); | |||||
| ctemp07 = *(psrc4 + 0); | |||||
| ctemp08 = *(psrc4 + 1); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| *(pdst + 2) = ctemp03; | |||||
| *(pdst + 3) = ctemp04; | |||||
| *(pdst + 4) = ctemp05; | |||||
| *(pdst + 5) = ctemp06; | |||||
| *(pdst + 6) = ctemp07; | |||||
| *(pdst + 7) = ctemp08; | |||||
| pdst += 8; | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src2 = LD_SP(psrc2); | |||||
| psrc1 += 4; | |||||
| psrc2 += 4; | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| *(pdst + 2) = ctemp03; | |||||
| *(pdst + 3) = ctemp04; | |||||
| pdst += 4; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| ST_SP2_INC(src0, src1, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| psrc1 += 4; | |||||
| ST_SP(src0, pdst); | |||||
| pdst += 4; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| psrc1 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| pdst += 2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,310 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||||
| FLOAT *psrc8, *pdst; | |||||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; | |||||
| FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; | |||||
| FLOAT ctemp15, ctemp16; | |||||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||||
| v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||||
| psrc0 = src; | |||||
| pdst = dst; | |||||
| lda *= 2; | |||||
| for (j = (n >> 3); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc5 = psrc4 + lda; | |||||
| psrc6 = psrc5 + lda; | |||||
| psrc7 = psrc6 + lda; | |||||
| psrc8 = psrc7 + lda; | |||||
| psrc0 += 8 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||||
| LD_SP2_INC(psrc5, 4, src8, src9); | |||||
| LD_SP2_INC(psrc6, 4, src10, src11); | |||||
| LD_SP2_INC(psrc7, 4, src12, src13); | |||||
| LD_SP2_INC(psrc8, 4, src14, src15); | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||||
| ILVRL_D2_SP(src11, src9, dst2, dst6); | |||||
| ILVRL_D2_SP(src15, src13, dst3, dst7); | |||||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src2 = LD_SP(psrc2); | |||||
| src4 = LD_SP(psrc3); | |||||
| src6 = LD_SP(psrc4); | |||||
| src8 = LD_SP(psrc5); | |||||
| src10 = LD_SP(psrc6); | |||||
| src12 = LD_SP(psrc7); | |||||
| src14 = LD_SP(psrc8); | |||||
| psrc1 += 4; | |||||
| psrc2 += 4; | |||||
| psrc3 += 4; | |||||
| psrc4 += 4; | |||||
| psrc5 += 4; | |||||
| psrc6 += 4; | |||||
| psrc7 += 4; | |||||
| psrc8 += 4; | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| ctemp05 = *(psrc3 + 0); | |||||
| ctemp06 = *(psrc3 + 1); | |||||
| ctemp07 = *(psrc4 + 0); | |||||
| ctemp08 = *(psrc4 + 1); | |||||
| ctemp09 = *(psrc5 + 0); | |||||
| ctemp10 = *(psrc5 + 1); | |||||
| ctemp11 = *(psrc6 + 0); | |||||
| ctemp12 = *(psrc6 + 1); | |||||
| ctemp13 = *(psrc7 + 0); | |||||
| ctemp14 = *(psrc7 + 1); | |||||
| ctemp15 = *(psrc8 + 0); | |||||
| ctemp16 = *(psrc8 + 1); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| psrc5 += 2; | |||||
| psrc6 += 2; | |||||
| psrc7 += 2; | |||||
| psrc8 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| *(pdst + 2) = ctemp03; | |||||
| *(pdst + 3) = ctemp04; | |||||
| *(pdst + 4) = ctemp05; | |||||
| *(pdst + 5) = ctemp06; | |||||
| *(pdst + 6) = ctemp07; | |||||
| *(pdst + 7) = ctemp08; | |||||
| *(pdst + 8) = ctemp09; | |||||
| *(pdst + 9) = ctemp10; | |||||
| *(pdst + 10) = ctemp11; | |||||
| *(pdst + 11) = ctemp12; | |||||
| *(pdst + 12) = ctemp13; | |||||
| *(pdst + 13) = ctemp14; | |||||
| *(pdst + 14) = ctemp15; | |||||
| *(pdst + 15) = ctemp16; | |||||
| pdst += 16; | |||||
| } | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src2 = LD_SP(psrc2); | |||||
| src4 = LD_SP(psrc3); | |||||
| src6 = LD_SP(psrc4); | |||||
| psrc1 += 4; | |||||
| psrc2 += 4; | |||||
| psrc3 += 4; | |||||
| psrc4 += 4; | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| ctemp05 = *(psrc3 + 0); | |||||
| ctemp06 = *(psrc3 + 1); | |||||
| ctemp07 = *(psrc4 + 0); | |||||
| ctemp08 = *(psrc4 + 1); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| *(pdst + 2) = ctemp03; | |||||
| *(pdst + 3) = ctemp04; | |||||
| *(pdst + 4) = ctemp05; | |||||
| *(pdst + 5) = ctemp06; | |||||
| *(pdst + 6) = ctemp07; | |||||
| *(pdst + 7) = ctemp08; | |||||
| pdst += 8; | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src2 = LD_SP(psrc2); | |||||
| psrc1 += 4; | |||||
| psrc2 += 4; | |||||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| *(pdst + 2) = ctemp03; | |||||
| *(pdst + 3) = ctemp04; | |||||
| pdst += 4; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||||
| ST_SP2_INC(src0, src1, pdst, 4); | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| psrc1 += 4; | |||||
| ST_SP(src0, pdst); | |||||
| pdst += 4; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| psrc1 += 2; | |||||
| *(pdst + 0) = ctemp01; | |||||
| *(pdst + 1) = ctemp02; | |||||
| pdst += 2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,125 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0; | |||||
| FLOAT *psrc1, *psrc2; | |||||
| FLOAT *pdst0; | |||||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||||
| v4f32 src0, src1, src2, src3; | |||||
| psrc0 = src; | |||||
| pdst0 = dst; | |||||
| lda *= 2; | |||||
| for (j = (n >> 2); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 8; | |||||
| for (i = (m >> 1); i--;) | |||||
| { | |||||
| LD_SP2(psrc1, 4, src0, src1); | |||||
| LD_SP2(psrc2, 4, src2, src3); | |||||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| LD_SP2(psrc1, 4, src0, src1); | |||||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 4; | |||||
| for (i = (m >> 1); i--;) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src1 = LD_SP(psrc2); | |||||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| ST_SP(src0, pdst0); | |||||
| pdst0 += 4; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 2; | |||||
| for (i = (m >> 1); i--;) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| *(pdst0 + 2) = ctemp03; | |||||
| *(pdst0 + 3) = ctemp04; | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| pdst0 += 4; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| pdst0 += 2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,214 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *pdst0; | |||||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||||
| psrc0 = src; | |||||
| pdst0 = dst; | |||||
| lda *= 2; | |||||
| for (j = (n >> 3); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 16; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||||
| LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); | |||||
| LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); | |||||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||||
| ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); | |||||
| psrc1 += 4 * lda; | |||||
| psrc2 += 4 * lda; | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||||
| } | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 8; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_SP2(psrc1, 4, src0, src1); | |||||
| LD_SP2(psrc2, 4, src2, src3); | |||||
| LD_SP2(psrc1 + 2 * lda, 4, src4, src5); | |||||
| LD_SP2(psrc2 + 2 * lda, 4, src6, src7); | |||||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||||
| ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); | |||||
| psrc1 += 4 * lda; | |||||
| psrc2 += 4 * lda; | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| LD_SP2(psrc1, 4, src0, src1); | |||||
| LD_SP2(psrc2, 4, src2, src3); | |||||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| LD_SP2(psrc1, 4, src0, src1); | |||||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 4; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src1 = LD_SP(psrc2); | |||||
| src2 = LD_SP(psrc1 + 2 * lda); | |||||
| src3 = LD_SP(psrc2 + 2 * lda); | |||||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||||
| psrc1 += 4 * lda; | |||||
| psrc2 += 4 * lda; | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| src1 = LD_SP(psrc2); | |||||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| src0 = LD_SP(psrc1); | |||||
| ST_SP(src0, pdst0); | |||||
| pdst0 += 4; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc0 + lda; | |||||
| psrc0 += 2; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| *(pdst0 + 2) = ctemp03; | |||||
| *(pdst0 + 3) = ctemp04; | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| pdst0 += 4; | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| *(pdst0 + 2) = ctemp03; | |||||
| *(pdst0 + 3) = ctemp04; | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| pdst0 += 4; | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| ctemp03 = *(psrc2 + 0); | |||||
| ctemp04 = *(psrc2 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| *(pdst0 + 2) = ctemp03; | |||||
| *(pdst0 + 3) = ctemp04; | |||||
| psrc1 += 2 * lda; | |||||
| psrc2 += 2 * lda; | |||||
| pdst0 += 4; | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| ctemp01 = *(psrc1 + 0); | |||||
| ctemp02 = *(psrc1 + 1); | |||||
| *(pdst0 + 0) = ctemp01; | |||||
| *(pdst0 + 1) = ctemp02; | |||||
| pdst0 += 2; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,611 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/* Sign operators for the complex multiply-accumulate steps in this file.
 * Any earlier definitions are dropped first so the choices made here apply. */
#undef OP4
#undef OP3
#undef OP2
#undef OP1
#undef OP0

/* OP3 / OP4: signs used when forming temp = alpha * x
 * (OP3 on the real part, OP4 on the imaginary part). */
#if defined(XCONJ)
    #define OP3 +=
    #define OP4 -=
#else
    #define OP3 -=
    #define OP4 +=
#endif

/* OP0 / OP1 / OP2: signs used when accumulating temp * A into y. */
#if !defined(CONJ) && !defined(XCONJ)
    #define OP0 -=
    #define OP1 +=
    #define OP2 +=
#elif !defined(CONJ)            /* XCONJ only */
    #define OP0 +=
    #define OP1 +=
    #define OP2 -=
#elif !defined(XCONJ)           /* CONJ only */
    #define OP0 +=
    #define OP1 -=
    #define OP2 -=
#else                           /* CONJ and XCONJ */
    #define OP0 -=
    #define OP1 -=
    #define OP2 +=
#endif
/* Accumulate 8 y elements from 4 columns of A.
 *
 * Loads four vectors from each of pa0..pa3 at float offset k, splits each
 * vector pair into src*r / src*i, and accumulates into y0r/y0i (from
 * src0/2/4/6) and y1r/y1i (from src1/3/5/7) using the per-column values
 * tp0..tp3 (r and i).  OP0/OP1/OP2 supply the signs chosen by CONJ/XCONJ.
 *
 * NOTE(review): LD_SP4 and PCKEVOD_W2_SP are defined in macros_msa.h; the
 * real/imaginary de-interleave is inferred from the operand names -- confirm.
 * The macro ends without a trailing line continuation so that it never
 * absorbs the line that follows it.
 */
#define CGEMV_N_8x4() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
    \
    y0r += tp0r * src0r; \
    y1r += tp0r * src1r; \
    y0r += tp1r * src2r; \
    y1r += tp1r * src3r; \
    y0r += tp2r * src4r; \
    y1r += tp2r * src5r; \
    y0r += tp3r * src6r; \
    y1r += tp3r * src7r; \
    \
    y0r OP0 tp0i * src0i; \
    y1r OP0 tp0i * src1i; \
    y0r OP0 tp1i * src2i; \
    y1r OP0 tp1i * src3i; \
    y0r OP0 tp2i * src4i; \
    y1r OP0 tp2i * src5i; \
    y0r OP0 tp3i * src6i; \
    y1r OP0 tp3i * src7i; \
    \
    y0i OP1 tp0r * src0i; \
    y1i OP1 tp0r * src1i; \
    y0i OP1 tp1r * src2i; \
    y1i OP1 tp1r * src3i; \
    y0i OP1 tp2r * src4i; \
    y1i OP1 tp2r * src5i; \
    y0i OP1 tp3r * src6i; \
    y1i OP1 tp3r * src7i; \
    \
    y0i OP2 tp0i * src0r; \
    y1i OP2 tp0i * src1r; \
    y0i OP2 tp1i * src2r; \
    y1i OP2 tp1i * src3r; \
    y0i OP2 tp2i * src4r; \
    y1i OP2 tp2i * src5r; \
    y0i OP2 tp3i * src6r; \
    y1i OP2 tp3i * src7r;
/* Accumulate 4 y elements from 4 columns of A.
 *
 * Loads two vectors from each of pa0..pa3 at float offset k, splits them into
 * src0/2/4/6 (r and i), and accumulates into y0r/y0i using tp0..tp3 (r and i)
 * with the OP0/OP1/OP2 signs.
 * NOTE(review): LD_SP2 / PCKEVOD_W2_SP semantics come from macros_msa.h.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_4x4() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    LD_SP2(pa2 + k, 4, t8, t9); \
    LD_SP2(pa3 + k, 4, t12, t13); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    \
    y0r += tp0r * src0r; \
    y0r += tp1r * src2r; \
    y0r += tp2r * src4r; \
    y0r += tp3r * src6r; \
    \
    y0r OP0 tp0i * src0i; \
    y0r OP0 tp1i * src2i; \
    y0r OP0 tp2i * src4i; \
    y0r OP0 tp3i * src6i; \
    \
    y0i OP1 tp0r * src0i; \
    y0i OP1 tp1r * src2i; \
    y0i OP1 tp2r * src4i; \
    y0i OP1 tp3r * src6i; \
    \
    y0i OP2 tp0i * src0r; \
    y0i OP2 tp1i * src2r; \
    y0i OP2 tp2i * src4r; \
    y0i OP2 tp3i * src6r;
/* Scalar update of one complex y element from 4 columns of A.
 *
 * Reads y[0] (real) and y[1] (imaginary), adds the contributions of
 * pa0..pa3 at offsets k (real) and k + 1 (imaginary) weighted by
 * temp0..temp3 (_r / _i) with the OP0/OP1/OP2 signs, and writes the pair back.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_1x4() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp0_r * pa0[k]; \
    res0 OP0 temp0_i * pa0[k + 1]; \
    res0 += temp1_r * pa1[k]; \
    res0 OP0 temp1_i * pa1[k + 1]; \
    res0 += temp2_r * pa2[k]; \
    res0 OP0 temp2_i * pa2[k + 1]; \
    res0 += temp3_r * pa3[k]; \
    res0 OP0 temp3_i * pa3[k + 1]; \
    \
    res1 OP1 temp0_r * pa0[k + 1]; \
    res1 OP2 temp0_i * pa0[k]; \
    res1 OP1 temp1_r * pa1[k + 1]; \
    res1 OP2 temp1_i * pa1[k]; \
    res1 OP1 temp2_r * pa2[k + 1]; \
    res1 OP2 temp2_i * pa2[k]; \
    res1 OP1 temp3_r * pa3[k + 1]; \
    res1 OP2 temp3_i * pa3[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/* Accumulate 8 y elements from 2 columns of A.
 *
 * Loads four vectors from each of pa0 and pa1 at float offset k, splits them
 * into src0..src3 (r and i), and accumulates into y0r/y0i and y1r/y1i using
 * tp0 and tp1 (r and i) with the OP0/OP1/OP2 signs.
 * NOTE(review): LD_SP4 / PCKEVOD_W2_SP semantics come from macros_msa.h.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_8x2() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    \
    y0r += tp0r * src0r; \
    y1r += tp0r * src1r; \
    y0r += tp1r * src2r; \
    y1r += tp1r * src3r; \
    \
    y0r OP0 tp0i * src0i; \
    y1r OP0 tp0i * src1i; \
    y0r OP0 tp1i * src2i; \
    y1r OP0 tp1i * src3i; \
    \
    y0i OP1 tp0r * src0i; \
    y1i OP1 tp0r * src1i; \
    y0i OP1 tp1r * src2i; \
    y1i OP1 tp1r * src3i; \
    \
    y0i OP2 tp0i * src0r; \
    y1i OP2 tp0i * src1r; \
    y0i OP2 tp1i * src2r; \
    y1i OP2 tp1i * src3r;
/* Accumulate 4 y elements from 2 columns of A.
 *
 * Loads two vectors from each of pa0 and pa1 at float offset k, splits them
 * into src0 / src2 (r and i), and accumulates into y0r/y0i using tp0 and tp1
 * (r and i) with the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_4x2() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    \
    y0r += tp0r * src0r; \
    y0r += tp1r * src2r; \
    \
    y0r OP0 tp0i * src0i; \
    y0r OP0 tp1i * src2i; \
    \
    y0i OP1 tp0r * src0i; \
    y0i OP1 tp1r * src2i; \
    \
    y0i OP2 tp0i * src0r; \
    y0i OP2 tp1i * src2r;
/* Scalar update of one complex y element from 2 columns of A.
 *
 * Reads y[0] / y[1], adds the contributions of pa0 and pa1 at offsets k and
 * k + 1 weighted by temp0 / temp1 (_r and _i) with the OP0/OP1/OP2 signs, and
 * writes the pair back.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_1x2() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp0_r * pa0[k]; \
    res0 OP0 temp0_i * pa0[k + 1]; \
    res0 += temp1_r * pa1[k]; \
    res0 OP0 temp1_i * pa1[k + 1]; \
    \
    res1 OP1 temp0_r * pa0[k + 1]; \
    res1 OP2 temp0_i * pa0[k]; \
    res1 OP1 temp1_r * pa1[k + 1]; \
    res1 OP2 temp1_i * pa1[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/* Scalar update of one complex y element from a single column of A.
 *
 * Reads y[0] / y[1], adds temp_r / temp_i times pa0[k] / pa0[k + 1] with the
 * OP0/OP1/OP2 signs, and writes the pair back.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_N_1x1() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp_r * pa0[k]; \
    res0 OP0 temp_i * pa0[k + 1]; \
    \
    res1 OP1 temp_r * pa0[k + 1]; \
    res1 OP2 temp_i * pa0[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/* Load four complex x values with vector loads and scale them by alpha.
 *
 * Loads two vectors from x, splits them into x0r / x0i, forms
 * tp4r = alphar * x0r OP3 alphai * x0i and tp4i = alphar * x0i OP4 alphai * x0r,
 * then distributes tp4r / tp4i into tp0..tp3 (r and i) via SPLATI_W4_SP.
 * NOTE(review): SPLATI_W4_SP presumably broadcasts lane j into the j-th
 * output vector (defined in macros_msa.h) -- confirm.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_X4_SCALE_VECTOR() \
    LD_SP2(x, 4, x0, x1); \
    \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
    \
    tp4r = alphar * x0r; \
    tp4r OP3 alphai * x0i; \
    tp4i = alphar * x0i; \
    tp4i OP4 alphai * x0r; \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
/* Gather four strided complex x values lane by lane and scale them by alpha.
 *
 * Each float of x is read through an int pointer and inserted into x0r
 * (offsets j * inc_x2) or x0i (offsets j * inc_x2 + 1) with __msa_insert_w.
 * tp0r only seeds the first insert of each chain; all four lanes are
 * overwritten by the inserts that follow.  The scaled values are then formed
 * and distributed exactly as in the vector-load variant.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_X4_SCALE_GP() \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
    \
    tp4r = alphar * x0r; \
    tp4r OP3 alphai * x0i; \
    tp4i = alphar * x0i; \
    tp4i OP4 alphai * x0r; \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
/* Scale two complex x values by alpha using scalar arithmetic.
 *
 * Produces temp0_r/temp0_i and temp1_r/temp1_i from x[0] and x[1 * inc_x2]
 * (OP3 / OP4 give the signs), then copies each scalar into a vector
 * (tp0r, tp0i, tp1r, tp1i) with COPY_FLOAT_TO_VECTOR.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_X2_SCALE_GP() \
    temp0_r = alpha_r * x[0 * inc_x2]; \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
    temp0_i = alpha_r * x[0 * inc_x2 + 1]; \
    temp0_i OP4 alpha_i * x[0 * inc_x2]; \
    \
    temp1_r = alpha_r * x[1 * inc_x2]; \
    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \
    temp1_i = alpha_r * x[1 * inc_x2 + 1]; \
    temp1_i OP4 alpha_i * x[1 * inc_x2]; \
    \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);
/* Scale one complex x value by alpha into temp_r / temp_i (scalar path).
 * OP3 / OP4 supply the signs of the cross terms.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_X1_SCALE_GP() \
    temp_r = alpha_r * x[0 * inc_x2]; \
    temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
    temp_i = alpha_r * x[0 * inc_x2 + 1]; \
    temp_i OP4 alpha_i * x[0 * inc_x2];
/* Load 8 complex y values with vector loads and split them into
 * y0r/y0i and y1r/y1i.
 * NOTE(review): relies on LD_SP4 / PCKEVOD_W2_SP from macros_msa.h.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_Y8_VECTOR() \
    LD_SP4(y, 4, y0, y1, y2, y3); \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);
/* Load 4 complex y values with vector loads and split them into y0r / y0i.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_Y4_VECTOR() \
    LD_SP2(y, 4, y0, y1); \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);
/* Recombine y0r/y0i and y1r/y1i into y0..y3 and store them to y.
 * NOTE(review): ILVRL_W2_SP presumably re-interleaves real and imaginary
 * lanes (defined in macros_msa.h) -- confirm.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CSTORE_Y8_VECTOR() \
    ILVRL_W2_SP(y0i, y0r, y0, y1); \
    ILVRL_W2_SP(y1i, y1r, y2, y3); \
    ST_SP4(y0, y1, y2, y3, y, 4);
/* Recombine y0r / y0i into y0, y1 and store them to y.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CSTORE_Y4_VECTOR() \
    ILVRL_W2_SP(y0i, y0r, y0, y1); \
    ST_SP2(y0, y1, y, 4);
/* Gather 8 strided complex y values lane by lane.
 *
 * Elements 0..3 go to y0r (offset j * inc_y2) and y0i (offset j * inc_y2 + 1);
 * elements 4..7 go to y1r / y1i.  Each float is read through an int pointer
 * and placed with __msa_insert_w.  tp0r only seeds the first insert of each
 * chain; all four lanes are overwritten by the inserts that follow.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_Y8_GP() \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1)));
/* Gather 4 strided complex y values lane by lane into y0r / y0i.
 *
 * Each float is read through an int pointer and placed with __msa_insert_w.
 * tp0r only seeds the first insert of each chain; all four lanes are
 * overwritten by the inserts that follow.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CLOAD_Y4_GP() \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1)));
/* Scatter 8 complex results back to strided y.
 *
 * Lanes 0..3 of y0r / y1r are written to offsets j * inc_y2 (j = 0..7) and
 * lanes 0..3 of y0i / y1i to offsets j * inc_y2 + 1.  Each lane is extracted
 * with __msa_copy_s_w and stored through an int pointer.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CSTORE_Y8_GP() \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);
/* Scatter 4 complex results back to strided y.
 *
 * Lanes 0..3 of y0r go to offsets j * inc_y2 and lanes 0..3 of y0i to
 * offsets j * inc_y2 + 1, each extracted with __msa_copy_s_w and stored
 * through an int pointer.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CSTORE_Y4_GP() \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);
/* Driver loop for the non-transposed complex GEMV kernel.
 *
 * Walks the n columns of A in groups of 4, then 2, then 1.  For each group
 * it scales the matching x values (CLOAD_X*_SCALE), rewinds y to y_org, and
 * sweeps the m rows in chunks of 8, then 4, then single elements.  k is the
 * float offset into the current columns and advances by 2 per complex row.
 *
 * The including function must define CLOAD_X4_SCALE, CLOAD_X2_SCALE,
 * CLOAD_X1_SCALE, CLOAD_Y8, CLOAD_Y4, CSTORE_Y8 and CSTORE_Y4 before
 * expanding this macro.
 *
 * Every helper invocation is terminated with ';' and the macro ends on its
 * closing brace, without a trailing line continuation.
 */
#define CGEMV_N_MSA() \
    for (j = (n >> 2); j--;) \
    { \
        CLOAD_X4_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_Y8(); \
            CGEMV_N_8x4(); \
            CSTORE_Y8(); \
            \
            k += 2 * 8; \
            y += inc_y2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_Y4(); \
            CGEMV_N_4x4(); \
            CSTORE_Y4(); \
            \
            k += 2 * 4; \
            y += inc_y2 * 4; \
        } \
        \
        if (m & 3) \
        { \
            /* scalar tail: pull the four scaled x values out of the vectors */ \
            temp0_r = tp4r[0]; \
            temp1_r = tp4r[1]; \
            temp2_r = tp4r[2]; \
            temp3_r = tp4r[3]; \
            \
            temp0_i = tp4i[0]; \
            temp1_i = tp4i[1]; \
            temp2_i = tp4i[2]; \
            temp3_i = tp4i[3]; \
            \
            for (i = (m & 3); i--;) \
            { \
                CGEMV_N_1x4(); \
                \
                k += 2; \
                y += inc_y2; \
            } \
        } \
        \
        pa0 += 4 * lda2; \
        pa1 += 4 * lda2; \
        pa2 += 4 * lda2; \
        pa3 += 4 * lda2; \
        \
        x += 4 * inc_x2; \
    } \
    \
    if (n & 2) \
    { \
        CLOAD_X2_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_Y8(); \
            CGEMV_N_8x2(); \
            CSTORE_Y8(); \
            \
            k += 2 * 8; \
            y += inc_y2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_Y4(); \
            CGEMV_N_4x2(); \
            CSTORE_Y4(); \
            \
            k += 2 * 4; \
            y += inc_y2 * 4; \
        } \
        \
        for (i = (m & 3); i--;) \
        { \
            CGEMV_N_1x2(); \
            \
            k += 2; \
            y += inc_y2; \
        } \
        \
        pa0 += 2 * lda2; \
        pa1 += 2 * lda2; \
        \
        x += 2 * inc_x2; \
    } \
    \
    if (n & 1) \
    { \
        CLOAD_X1_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = m; i--;) \
        { \
            CGEMV_N_1x1(); \
            \
            k += 2; \
            y += inc_y2; \
        } \
        \
        pa0 += lda2; \
        x += inc_x2; \
    }
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, | |||||
| BLASLONG inc_y2, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i, j, k; | |||||
| FLOAT *y_org = y; | |||||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||||
| FLOAT temp_r, temp_i, res0, res1, temp0_r; | |||||
| FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; | |||||
| v4f32 alphar, alphai; | |||||
| v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; | |||||
| v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||||
| v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; | |||||
| v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; | |||||
| v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i; | |||||
| lda2 = 2 * lda2; | |||||
| inc_x2 = 2 * inc_x2; | |||||
| inc_y2 = 2 * inc_y2; | |||||
| pa0 = A; | |||||
| pa1 = A + lda2; | |||||
| pa2 = A + 2 * lda2; | |||||
| pa3 = A + 3 * lda2; | |||||
| alphar = COPY_FLOAT_TO_VECTOR(alpha_r); | |||||
| alphai = COPY_FLOAT_TO_VECTOR(alpha_i); | |||||
| if ((2 == inc_x2) && (2 == inc_y2)) | |||||
| { | |||||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR | |||||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||||
| #define CLOAD_Y8 CLOAD_Y8_VECTOR | |||||
| #define CLOAD_Y4 CLOAD_Y4_VECTOR | |||||
| #define CSTORE_Y8 CSTORE_Y8_VECTOR | |||||
| #define CSTORE_Y4 CSTORE_Y4_VECTOR | |||||
| CGEMV_N_MSA(); | |||||
| #undef CLOAD_X4_SCALE | |||||
| #undef CLOAD_X2_SCALE | |||||
| #undef CLOAD_X1_SCALE | |||||
| #undef CLOAD_Y8 | |||||
| #undef CLOAD_Y4 | |||||
| #undef CSTORE_Y8 | |||||
| #undef CSTORE_Y4 | |||||
| } | |||||
| else if (2 == inc_x2) | |||||
| { | |||||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR | |||||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||||
| #define CLOAD_Y8 CLOAD_Y8_GP | |||||
| #define CLOAD_Y4 CLOAD_Y4_GP | |||||
| #define CSTORE_Y8 CSTORE_Y8_GP | |||||
| #define CSTORE_Y4 CSTORE_Y4_GP | |||||
| CGEMV_N_MSA(); | |||||
| #undef CLOAD_X4_SCALE | |||||
| #undef CLOAD_X2_SCALE | |||||
| #undef CLOAD_X1_SCALE | |||||
| #undef CLOAD_Y8 | |||||
| #undef CLOAD_Y4 | |||||
| #undef CSTORE_Y8 | |||||
| #undef CSTORE_Y4 | |||||
| } | |||||
| else if (2 == inc_y2) | |||||
| { | |||||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP | |||||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||||
| #define CLOAD_Y8 CLOAD_Y8_VECTOR | |||||
| #define CLOAD_Y4 CLOAD_Y4_VECTOR | |||||
| #define CSTORE_Y8 CSTORE_Y8_VECTOR | |||||
| #define CSTORE_Y4 CSTORE_Y4_VECTOR | |||||
| CGEMV_N_MSA(); | |||||
| #undef CLOAD_X4_SCALE | |||||
| #undef CLOAD_X2_SCALE | |||||
| #undef CLOAD_X1_SCALE | |||||
| #undef CLOAD_Y8 | |||||
| #undef CLOAD_Y4 | |||||
| #undef CSTORE_Y8 | |||||
| #undef CSTORE_Y4 | |||||
| } | |||||
| else | |||||
| { | |||||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP | |||||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||||
| #define CLOAD_Y8 CLOAD_Y8_GP | |||||
| #define CLOAD_Y4 CLOAD_Y4_GP | |||||
| #define CSTORE_Y8 CSTORE_Y8_GP | |||||
| #define CSTORE_Y4 CSTORE_Y4_GP | |||||
| CGEMV_N_MSA(); | |||||
| #undef CLOAD_X4_SCALE | |||||
| #undef CLOAD_X2_SCALE | |||||
| #undef CLOAD_X1_SCALE | |||||
| #undef CLOAD_Y8 | |||||
| #undef CLOAD_Y4 | |||||
| #undef CSTORE_Y8 | |||||
| #undef CSTORE_Y4 | |||||
| } | |||||
| return(0); | |||||
| } | |||||
/* Release the sign-operator macros defined at the top of this file. */
#undef OP4
#undef OP3
#undef OP2
#undef OP1
#undef OP0
| @@ -0,0 +1,583 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/* Sign operators for the complex multiply-accumulate steps in this file.
 * Any earlier definitions are dropped first so the choices made here apply. */
#undef OP2
#undef OP1
#undef OP0

#if (defined(CONJ) && !defined(XCONJ)) || (!defined(CONJ) && defined(XCONJ))
    /* exactly one of CONJ / XCONJ is defined */
    #define OP0 +=
    #define OP1 +=
    #define OP2 -=
#else
    /* neither or both of CONJ / XCONJ are defined */
    #define OP0 -=
    #define OP1 +=
    #define OP2 +=
#endif
/* Accumulate 8 rows of 4 columns of A against x into tp0..tp3.
 *
 * Loads four vectors from each of pa0..pa3 at float offset k, splits them
 * into src0..src7 (r and i), and adds their element-wise products with
 * x0r/x0i (first vector pair of each column) and x1r/x1i (second pair) to
 * the per-column accumulators tp0..tp3 (r and i).  OP0/OP1/OP2 supply the
 * signs chosen by CONJ/XCONJ.
 * NOTE(review): LD_SP4 / PCKEVOD_W2_SP semantics come from macros_msa.h.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_8x4() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp1r += src2r * x0r; \
    tp1r += src3r * x1r; \
    tp1r OP0 src2i * x0i; \
    tp1r OP0 src3i * x1i; \
    \
    tp2r += src4r * x0r; \
    tp2r += src5r * x1r; \
    tp2r OP0 src4i * x0i; \
    tp2r OP0 src5i * x1i; \
    \
    tp3r += src6r * x0r; \
    tp3r += src7r * x1r; \
    tp3r OP0 src6i * x0i; \
    tp3r OP0 src7i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP1 src3r * x1i; \
    tp1i OP2 src2i * x0r; \
    tp1i OP2 src3i * x1r; \
    \
    tp2i OP1 src4r * x0i; \
    tp2i OP1 src5r * x1i; \
    tp2i OP2 src4i * x0r; \
    tp2i OP2 src5i * x1r; \
    \
    tp3i OP1 src6r * x0i; \
    tp3i OP1 src7r * x1i; \
    tp3i OP2 src6i * x0r; \
    tp3i OP2 src7i * x1r;
/* Accumulate 8 rows of 2 columns of A against x into tp0 / tp1.
 *
 * Loads four vectors from each of pa0 and pa1 at float offset k, splits them
 * into src0..src3 (r and i), and adds their element-wise products with
 * x0r/x0i and x1r/x1i to tp0 and tp1 (r and i) using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_8x2() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp1r += src2r * x0r; \
    tp1r += src3r * x1r; \
    tp1r OP0 src2i * x0i; \
    tp1r OP0 src3i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP1 src3r * x1i; \
    tp1i OP2 src2i * x0r; \
    tp1i OP2 src3i * x1r;
/* Accumulate 8 rows of one column of A against x into tp0r / tp0i.
 *
 * Loads four vectors from pa0 at float offset k, splits them into src0 / src1
 * (r and i), and adds their element-wise products with x0r/x0i and x1r/x1i
 * to tp0r / tp0i using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_8x1() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r;
/* Accumulate 4 rows of 4 columns of A against x into tp0..tp3.
 *
 * Loads two vectors from each of pa0..pa3 at float offset k, splits them into
 * src0/2/4/6 (r and i), and adds their element-wise products with x0r/x0i to
 * tp0..tp3 (r and i) using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_4x4() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    LD_SP2(pa2 + k, 4, t8, t9); \
    LD_SP2(pa3 + k, 4, t12, t13); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp1r += src2r * x0r; \
    tp1r OP0 src2i * x0i; \
    \
    tp2r += src4r * x0r; \
    tp2r OP0 src4i * x0i; \
    \
    tp3r += src6r * x0r; \
    tp3r OP0 src6i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP2 src2i * x0r; \
    \
    tp2i OP1 src4r * x0i; \
    tp2i OP2 src4i * x0r; \
    \
    tp3i OP1 src6r * x0i; \
    tp3i OP2 src6i * x0r;
/* Accumulate 4 rows of 2 columns of A against x into tp0 / tp1.
 *
 * Loads two vectors from each of pa0 and pa1 at float offset k, splits them
 * into src0 / src2 (r and i), and adds their element-wise products with
 * x0r/x0i to tp0 and tp1 (r and i) using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_4x2() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp1r += src2r * x0r; \
    tp1r OP0 src2i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP2 src2i * x0r;
/* Accumulate 4 rows of one column of A against x into tp0r / tp0i.
 *
 * Loads two vectors from pa0 at float offset k, splits them into
 * src0r / src0i, and adds their element-wise products with x0r/x0i to
 * tp0r / tp0i using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_4x1() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r;
/* Scalar accumulation of one row of 4 columns of A against one x element.
 *
 * Adds pa0..pa3 at offsets k (real) / k + 1 (imaginary) times x[0] / x[1]
 * into temp0..temp3 (r and i) using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_1x4() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    temp1r += pa1[k + 0] * x[0 * inc_x2]; \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
    temp2r += pa2[k + 0] * x[0 * inc_x2]; \
    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \
    temp3r += pa3[k + 0] * x[0 * inc_x2]; \
    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \
    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \
    temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \
    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \
    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];
/* Scalar accumulation of one row of 2 columns of A against one x element.
 *
 * Adds pa0 / pa1 at offsets k and k + 1 times x[0] / x[1] into temp0 and
 * temp1 (r and i) using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_1x2() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    temp1r += pa1[k + 0] * x[0 * inc_x2]; \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];
/* Scalar accumulation of one element of one column of A against one x
 * element into temp0r / temp0i, using the OP0/OP1/OP2 signs.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CGEMV_T_1x1() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];
/* Scale four accumulated results by alpha and add them into y.
 *
 * Reads y[j * inc_y2] / y[j * inc_y2 + 1] for j = 0..3, adds
 * alphar / alphai times temp0..temp3 (r and i) with the OP0/OP1/OP2 signs,
 * and writes the four complex values back.
 * No trailing line continuation: the macro ends on its last statement.
 */
#define CSCALE_STORE_Y4_GP() \
    res0r = y[0 * inc_y2]; \
    res1r = y[1 * inc_y2]; \
    res2r = y[2 * inc_y2]; \
    res3r = y[3 * inc_y2]; \
    \
    res0i = y[0 * inc_y2 + 1]; \
    res1i = y[1 * inc_y2 + 1]; \
    res2i = y[2 * inc_y2 + 1]; \
    res3i = y[3 * inc_y2 + 1]; \
    \
    res0r += alphar * temp0r; \
    res0r OP0 alphai * temp0i; \
    res1r += alphar * temp1r; \
    res1r OP0 alphai * temp1i; \
    res2r += alphar * temp2r; \
    res2r OP0 alphai * temp2i; \
    res3r += alphar * temp3r; \
    res3r OP0 alphai * temp3i; \
    \
    res0i OP1 alphar * temp0i; \
    res0i OP2 alphai * temp0r; \
    res1i OP1 alphar * temp1i; \
    res1i OP2 alphai * temp1r; \
    res2i OP1 alphar * temp2i; \
    res2i OP2 alphai * temp2r; \
    res3i OP1 alphar * temp3i; \
    res3i OP2 alphai * temp3r; \
    \
    y[0 * inc_y2] = res0r; \
    y[1 * inc_y2] = res1r; \
    y[2 * inc_y2] = res2r; \
    y[3 * inc_y2] = res3r; \
    \
    y[0 * inc_y2 + 1] = res0i; \
    y[1 * inc_y2 + 1] = res1i; \
    y[2 * inc_y2 + 1] = res2i; \
    y[3 * inc_y2 + 1] = res3i;
/*
 * CSCALE_STORE_Y2_GP: two-entry version of the scale-and-store step.
 *
 * Loads y[0 * inc_y2] and y[1 * inc_y2] (plus the + 1 components), adds
 * alpha times temp0 / temp1 using the same term pattern and OP0/OP1/OP2
 * operators as the four-entry variant, and stores the results back.
 *
 * Reads:   y, inc_y2, alphar, alphai, temp0r, temp0i, temp1r, temp1i
 * Writes:  res0r, res0i, res1r, res1i (scratch) and the two y entries
 */
| #define CSCALE_STORE_Y2_GP() \ | |||||
| res0r = y[0 * inc_y2]; \ | |||||
| res1r = y[1 * inc_y2]; \ | |||||
| \ | |||||
| res0i = y[0 * inc_y2 + 1]; \ | |||||
| res1i = y[1 * inc_y2 + 1]; \ | |||||
| \ | |||||
| res0r += alphar * temp0r; \ | |||||
| res0r OP0 alphai * temp0i; \ | |||||
| res1r += alphar * temp1r; \ | |||||
| res1r OP0 alphai * temp1i; \ | |||||
| \ | |||||
| res0i OP1 alphar * temp0i; \ | |||||
| res0i OP2 alphai * temp0r; \ | |||||
| res1i OP1 alphar * temp1i; \ | |||||
| res1i OP2 alphai * temp1r; \ | |||||
| \ | |||||
| y[0 * inc_y2] = res0r; \ | |||||
| y[1 * inc_y2] = res1r; \ | |||||
| \ | |||||
| y[0 * inc_y2 + 1] = res0i; \ | |||||
| y[1 * inc_y2 + 1] = res1i; \ | |||||
/*
 * CSCALE_STORE_Y1_GP: single-entry version of the scale-and-store step.
 *
 * Loads y[0] / y[1] (written as y[0 * inc_y2] and y[0 * inc_y2 + 1]), adds
 * alpha times (temp0r, temp0i) using OP0/OP1/OP2 for the sign of the cross
 * terms, and stores the pair back.
 *
 * Reads:   y, inc_y2, alphar, alphai, temp0r, temp0i
 * Writes:  res0r, res0i (scratch) and the y entry
 */
| #define CSCALE_STORE_Y1_GP() \ | |||||
| res0r = y[0 * inc_y2]; \ | |||||
| res0i = y[0 * inc_y2 + 1]; \ | |||||
| \ | |||||
| res0r += alphar * temp0r; \ | |||||
| res0r OP0 alphai * temp0i; \ | |||||
| \ | |||||
| res0i OP1 alphar * temp0i; \ | |||||
| res0i OP2 alphai * temp0r; \ | |||||
| \ | |||||
| y[0 * inc_y2] = res0r; \ | |||||
| y[0 * inc_y2 + 1] = res0i; \ | |||||
/*
 * CLOAD_X8_VECTOR: load x for the 8-row step with vector loads.
 *
 * LD_SP4 fills x0..x3 from x with a step of 4 between vectors, then each
 * pair of vectors is split by PCKEVOD_W2_SP into (x0r, x0i) and (x1r, x1i).
 * NOTE(review): PCKEVOD_W2_SP is defined elsewhere; presumably it packs the
 * even words into the first output and the odd words into the second,
 * i.e. de-interleaves the two components - confirm in macros_msa.h.
 * The entry point selects this loader only when 2 == inc_x2.
 */
| #define CLOAD_X8_VECTOR() \ | |||||
| LD_SP4(x, 4, x0, x1, x2, x3); \ | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ | |||||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); \ | |||||
/*
 * CLOAD_X4_VECTOR: load x for the 4-row step with vector loads.
 *
 * LD_SP2 fills x0 and x1 from x (step of 4 between vectors) and
 * PCKEVOD_W2_SP splits them into x0r and x0i.
 * NOTE(review): the even/odd de-interleave meaning of PCKEVOD_W2_SP is
 * assumed from its name - confirm in macros_msa.h.
 */
| #define CLOAD_X4_VECTOR() \ | |||||
| LD_SP2(x, 4, x0, x1); \ | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ | |||||
/*
 * CLOAD_X8_GP: gather eight strided x entries with scalar loads.
 *
 * Lane j of x0r receives the word at x + j * inc_x2 (j = 0..3), lane j of
 * x1r the word at x + (4 + j) * inc_x2; x0i / x1i receive the words one
 * element further (+ 1). Each value is read through an int pointer so that
 * its 32-bit pattern is inserted unchanged by __msa_insert_w
 * (NOTE(review): this assumes FLOAT is a 32-bit type - confirm).
 *
 * tp0r only serves as the starting vector for the lane-0 insert; all four
 * lanes of every destination are overwritten, so its contents do not
 * reach the result.
 */
| #define CLOAD_X8_GP() \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ | |||||
| x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \ | |||||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \ | |||||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \ | |||||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ | |||||
| x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \ | |||||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \ | |||||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \ | |||||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \ | |||||
/*
 * CLOAD_X4_GP: gather four strided x entries with scalar loads.
 *
 * Lane j of x0r receives the word at x + j * inc_x2 and lane j of x0i the
 * word at x + j * inc_x2 + 1 (j = 0..3), each read through an int pointer
 * and inserted with __msa_insert_w. tp0r is only the seed vector for the
 * lane-0 insert; every lane is overwritten.
 */
| #define CLOAD_X4_GP() \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ | |||||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ | |||||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ | |||||
/*
 * CGEMV_T_MSA: main driver, expanded inside the entry point once per x
 * loader (CLOAD_X8 / CLOAD_X4 must be #defined before expansion).
 *
 * Structure, as written below:
 *   1. Groups of 4 columns (n >> 2 iterations), then one group of 2 columns
 *      if (n & 2), then one single column if (n & 1).
 *   2. For every group the vector accumulators tp* are reset to zero, k is
 *      reset to 0 and x is rewound to srcx_org.
 *   3. Rows are consumed 8 at a time (m >> 3 iterations) and then 4 at a
 *      time if (m & 4), using CLOAD_X8/CLOAD_X4 plus the CGEMV_T_8xN /
 *      CGEMV_T_4xN macros (defined outside this view). Each step advances
 *      k by 2 per row and x by inc_x2 per row.
 *   4. The vector accumulators are reduced to the scalar temp*r / temp*i
 *      values: a 4x4 transpose followed by adds for the 4- and 2-column
 *      cases, an ILVRL interleave followed by adds for the 1-column case.
 *   5. The remaining (m & 3) rows are handled one at a time by CGEMV_T_1xN.
 *   6. CSCALE_STORE_YN_GP applies alpha and updates y, then the column
 *      pointers and y advance past the processed group.
 *
 * Relies on these caller locals: i, j, k, m, n, x, y, srcx_org, pa0..pa3,
 * lda2, inc_x2, inc_y2, zero, t0, t1, tp0r..tp3r, tp0i..tp3i, temp*, res*.
 */
| #define CGEMV_T_MSA() \ | |||||
| for (j = (n >> 2); j--;) \ | |||||
| { \ | |||||
| tp0r = tp1r = tp2r = tp3r = zero; \ | |||||
| tp0i = tp1i = tp2i = tp3i = zero; \ | |||||
| \ | |||||
| k = 0; \ | |||||
| x = srcx_org; \ | |||||
| \ | |||||
| for (i = (m >> 3); i--;) \ | |||||
| { \ | |||||
| CLOAD_X8() \ | |||||
| CGEMV_T_8x4(); \ | |||||
| \ | |||||
| k += 2 * 8; \ | |||||
| x += inc_x2 * 8; \ | |||||
| } \ | |||||
| \ | |||||
| if (m & 4) \ | |||||
| { \ | |||||
| CLOAD_X4(); \ | |||||
| \ | |||||
| CGEMV_T_4x4(); \ | |||||
| \ | |||||
| k += 2 * 4; \ | |||||
| x += inc_x2 * 4; \ | |||||
| } \ | |||||
| \ | |||||
| TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \ | |||||
| tp0r, tp1r, tp2r, tp3r); \ | |||||
| TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \ | |||||
| tp0i, tp1i, tp2i, tp3i); \ | |||||
| \ | |||||
| tp0r += tp1r; \ | |||||
| tp0r += tp2r; \ | |||||
| tp0r += tp3r; \ | |||||
| tp0i += tp1i; \ | |||||
| tp0i += tp2i; \ | |||||
| tp0i += tp3i; \ | |||||
| \ | |||||
| temp0r = tp0r[0]; \ | |||||
| temp1r = tp0r[1]; \ | |||||
| temp2r = tp0r[2]; \ | |||||
| temp3r = tp0r[3]; \ | |||||
| temp0i = tp0i[0]; \ | |||||
| temp1i = tp0i[1]; \ | |||||
| temp2i = tp0i[2]; \ | |||||
| temp3i = tp0i[3]; \ | |||||
| \ | |||||
| for (i = (m & 3); i--;) \ | |||||
| { \ | |||||
| CGEMV_T_1x4(); \ | |||||
| \ | |||||
| k += 2; \ | |||||
| x += inc_x2; \ | |||||
| } \ | |||||
| \ | |||||
| CSCALE_STORE_Y4_GP(); \ | |||||
| \ | |||||
| pa0 += 4 * lda2; \ | |||||
| pa1 += 4 * lda2; \ | |||||
| pa2 += 4 * lda2; \ | |||||
| pa3 += 4 * lda2; \ | |||||
| y += 4 * inc_y2; \ | |||||
| } \ | |||||
| \ | |||||
| if (n & 2) \ | |||||
| { \ | |||||
| tp0r = tp1r = zero; \ | |||||
| tp0i = tp1i = zero; \ | |||||
| \ | |||||
| k = 0; \ | |||||
| x = srcx_org; \ | |||||
| \ | |||||
| for (i = (m >> 3); i--;) \ | |||||
| { \ | |||||
| CLOAD_X8(); \ | |||||
| \ | |||||
| CGEMV_T_8x2(); \ | |||||
| \ | |||||
| k += 2 * 8; \ | |||||
| x += inc_x2 * 8; \ | |||||
| } \ | |||||
| \ | |||||
| if (m & 4) \ | |||||
| { \ | |||||
| CLOAD_X4(); \ | |||||
| \ | |||||
| CGEMV_T_4x2(); \ | |||||
| \ | |||||
| k += 2 * 4; \ | |||||
| x += inc_x2 * 4; \ | |||||
| } \ | |||||
| \ | |||||
| TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \ | |||||
| tp0r, tp1r, tp0i, tp1i); \ | |||||
| \ | |||||
| tp0r += tp1r; \ | |||||
| tp0r += tp0i; \ | |||||
| tp0r += tp1i; \ | |||||
| \ | |||||
| temp0r = tp0r[0]; \ | |||||
| temp1r = tp0r[1]; \ | |||||
| temp0i = tp0r[2]; \ | |||||
| temp1i = tp0r[3]; \ | |||||
| \ | |||||
| for (i = (m & 3); i--;) \ | |||||
| { \ | |||||
| CGEMV_T_1x2(); \ | |||||
| \ | |||||
| k += 2; \ | |||||
| x += inc_x2; \ | |||||
| } \ | |||||
| \ | |||||
| CSCALE_STORE_Y2_GP(); \ | |||||
| \ | |||||
| pa0 += 2 * lda2; \ | |||||
| pa1 += 2 * lda2; \ | |||||
| y += 2 * inc_y2; \ | |||||
| } \ | |||||
| \ | |||||
| if (n & 1) \ | |||||
| { \ | |||||
| tp0r = zero; \ | |||||
| tp0i = zero; \ | |||||
| \ | |||||
| k = 0; \ | |||||
| x = srcx_org; \ | |||||
| \ | |||||
| for (i = (m >> 3); i--;) \ | |||||
| { \ | |||||
| CLOAD_X8(); \ | |||||
| \ | |||||
| CGEMV_T_8x1(); \ | |||||
| \ | |||||
| k += 2 * 8; \ | |||||
| x += inc_x2 * 8; \ | |||||
| } \ | |||||
| \ | |||||
| if (m & 4) \ | |||||
| { \ | |||||
| CLOAD_X4(); \ | |||||
| \ | |||||
| CGEMV_T_4x1(); \ | |||||
| \ | |||||
| k += 2 * 4; \ | |||||
| x += inc_x2 * 4; \ | |||||
| } \ | |||||
| \ | |||||
| ILVRL_W2_SP(tp0i, tp0r, t0, t1); \ | |||||
| \ | |||||
| t0 += t1; \ | |||||
| \ | |||||
| temp0r = t0[0] + t0[2]; \ | |||||
| temp0i = t0[1] + t0[3]; \ | |||||
| \ | |||||
| for (i = (m & 3); i--;) \ | |||||
| { \ | |||||
| CGEMV_T_1x1(); \ | |||||
| \ | |||||
| k += 2; \ | |||||
| x += inc_x2; \ | |||||
| } \ | |||||
| \ | |||||
| CSCALE_STORE_Y1_GP(); \ | |||||
| \ | |||||
| pa0 += lda2; \ | |||||
| y += inc_y2; \ | |||||
| } \ | |||||
/*
 * Kernel entry point: sets up pointers and strides, then expands
 * CGEMV_T_MSA with the x loader that matches the stride of x.
 *
 *   m, n           - loop extents consumed by CGEMV_T_MSA (rows per column
 *                    and number of columns respectively)
 *   alphar, alphai - the two components of the scaling factor applied when
 *                    the results are added into y
 *   A, lda         - matrix data; consecutive columns are lda2 = 2 * lda
 *                    FLOATs apart
 *   x, inc_x       - input vector; entries are inc_x2 = 2 * inc_x FLOATs apart
 *   y, inc_y       - output vector, updated in place; entries are
 *                    inc_y2 = 2 * inc_y FLOATs apart
 *   dummy1, buffer - not referenced in the code visible here
 *
 * Always returns 0.
 *
 * Many locals below are only touched from inside the expanded macros, so
 * their names must stay in sync with the macro bodies.
 */
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, | |||||
| FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||||
| BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i, j, k; | |||||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||||
/* Start of x; the driver macro rewinds x to this for every column group. */
| FLOAT *srcx_org = x; | |||||
| FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; | |||||
| FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; | |||||
| BLASLONG inc_x2, inc_y2, lda2; | |||||
| v4f32 zero = {0}; | |||||
| v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i; | |||||
| v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||||
| v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; | |||||
| v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; | |||||
| v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; | |||||
/* Strides are doubled because every logical entry occupies two FLOATs. */
| lda2 = 2 * lda; | |||||
/* pa0..pa3 point at four consecutive columns of A. */
| pa0 = A; | |||||
| pa1 = A + lda2; | |||||
| pa2 = A + 2 * lda2; | |||||
| pa3 = A + 3 * lda2; | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
/* Contiguous x (inc_x == 1): use the vector loaders. */
| if (2 == inc_x2) | |||||
| { | |||||
| #define CLOAD_X8 CLOAD_X8_VECTOR | |||||
| #define CLOAD_X4 CLOAD_X4_VECTOR | |||||
| CGEMV_T_MSA(); | |||||
| #undef CLOAD_X8 | |||||
| #undef CLOAD_X4 | |||||
| } | |||||
/* Any other stride: gather x with the scalar (GP) loaders. */
| else | |||||
| { | |||||
| #define CLOAD_X8 CLOAD_X8_GP | |||||
| #define CLOAD_X4 CLOAD_X4_GP | |||||
| CGEMV_T_MSA(); | |||||
| #undef CLOAD_X8 | |||||
| #undef CLOAD_X4 | |||||
| } | |||||
| return(0); | |||||
| } | |||||
/* Drop the operator macros so they do not leak past this kernel. */
| #undef OP0 | |||||
| #undef OP1 | |||||
| #undef OP2 | |||||
| @@ -0,0 +1,50 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| if ( n < 0 ) return(0); | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,278 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #include "macros_msa.h" | |||||
/*
 * AND_VEC_D(in): bitwise-AND a v2f64 value with the caller's local `and_vec`
 * mask and return the result as v2f64. With the 0x7FFF...F mask used in
 * this file this clears the top (sign) bit of each 64-bit lane, i.e. it
 * yields the absolute value of both lanes.
 */
| #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT sumf = 0.0; | |||||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||||
| v2f64 zero_v = {0}; | |||||
| v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; | |||||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||||
| if (1 == inc_x) | |||||
| { | |||||
| if (n > 15) | |||||
| { | |||||
| n -= 16; | |||||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 = AND_VEC_D(src0); | |||||
| sum_abs1 = AND_VEC_D(src1); | |||||
| sum_abs2 = AND_VEC_D(src2); | |||||
| sum_abs3 = AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| sum_abs3 += AND_VEC_D(src7); | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = zero_v; | |||||
| sum_abs1 = zero_v; | |||||
| sum_abs2 = zero_v; | |||||
| sum_abs3 = zero_v; | |||||
| } | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| sum_abs3 += AND_VEC_D(src7); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| } | |||||
| else if ((n & 8) && (n & 2)) | |||||
| { | |||||
| LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP3_INC(x, 2, src0, src1, src2); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(x, 2, src0, src1, src2, src3); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(x, 2, src0, src1); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(x); x += 2; | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| } | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||||
| if (n & 1) | |||||
| { | |||||
| sumf += fabs(*x); | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if (n > 8) | |||||
| { | |||||
| n -= 8; | |||||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 = AND_VEC_D(src0); | |||||
| sum_abs1 = AND_VEC_D(src1); | |||||
| sum_abs2 = AND_VEC_D(src2); | |||||
| sum_abs3 = AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| sum_abs3 += AND_VEC_D(src7); | |||||
| } | |||||
| else | |||||
| { | |||||
| sum_abs0 = zero_v; | |||||
| sum_abs1 = zero_v; | |||||
| sum_abs2 = zero_v; | |||||
| sum_abs3 = zero_v; | |||||
| } | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| sum_abs3 += AND_VEC_D(src7); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| sum_abs2 += AND_VEC_D(src6); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| sum_abs1 += AND_VEC_D(src5); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| sum_abs0 += AND_VEC_D(src4); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP3_INC(x, inc_x, src0, src1, src2); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, inc_x, src0, src1, src2, src3); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| sum_abs2 += AND_VEC_D(src2); | |||||
| sum_abs3 += AND_VEC_D(src3); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, inc_x, src0, src1); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| sum_abs1 += AND_VEC_D(src1); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| src0 = LD_DP(x); | |||||
| sum_abs0 += AND_VEC_D(src0); | |||||
| } | |||||
| } | |||||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||||
| sumf = sum_abs0[0]; | |||||
| } | |||||
| return (sumf); | |||||
| } | |||||
| @@ -0,0 +1,189 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| /* return float, x,y float */ | |||||
| #if defined(DSDOT) | |||||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #else | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #endif | |||||
| { | |||||
| BLASLONG i = 0; | |||||
| double dot = 0.0; | |||||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||||
| v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||||
| v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||||
| v2f64 dot0 = {0, 0}; | |||||
| if (n < 0) return (dot); | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||||
| LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| dot0 += (vy3 * vx3); | |||||
| dot0 += (vy4 * vx4); | |||||
| dot0 += (vy5 * vx5); | |||||
| dot0 += (vy6 * vx6); | |||||
| dot0 += (vy7 * vx7); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); | |||||
| LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| dot0 += (vy3 * vx3); | |||||
| dot0 += (vy4 * vx4); | |||||
| dot0 += (vy5 * vx5); | |||||
| dot0 += (vy6 * vx6); | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); | |||||
| LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| dot0 += (vy3 * vx3); | |||||
| dot0 += (vy4 * vx4); | |||||
| dot0 += (vy5 * vx5); | |||||
| } | |||||
| else if ((n & 8) && (n & 2)) | |||||
| { | |||||
| LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); | |||||
| LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| dot0 += (vy3 * vx3); | |||||
| dot0 += (vy4 * vx4); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP3_INC(x, 2, vx0, vx1, vx2); | |||||
| LD_DP3_INC(y, 2, vy0, vy1, vy2); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); | |||||
| LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| dot0 += (vy2 * vx2); | |||||
| dot0 += (vy3 * vx3); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(x, 2, vx0, vx1); | |||||
| LD_DP2_INC(y, 2, vy0, vy1); | |||||
| dot0 += (vy0 * vx0); | |||||
| dot0 += (vy1 * vx1); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| vx0 = LD_DP(x); x += 2; | |||||
| vy0 = LD_DP(y); y += 2; | |||||
| dot0 += (vy0 * vx0); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| x0 = *x; | |||||
| y0 = *y; | |||||
| dot += (y0 * x0); | |||||
| } | |||||
| } | |||||
| dot += dot0[0]; | |||||
| dot += dot0[1]; | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||||
| LD_GP4_INC(y, inc_y, y0, y1, y2, y3); | |||||
| dot += (y0 * x0); | |||||
| dot += (y1 * x1); | |||||
| dot += (y2 * x2); | |||||
| dot += (y3 * x3); | |||||
| } | |||||
| if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP3_INC(x, inc_x, x0, x1, x2); | |||||
| LD_GP3_INC(y, inc_y, y0, y1, y2); | |||||
| dot += (y0 * x0); | |||||
| dot += (y1 * x1); | |||||
| dot += (y2 * x2); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, inc_x, x0, x1); | |||||
| LD_GP2_INC(y, inc_y, y0, y1); | |||||
| dot += (y0 * x0); | |||||
| dot += (y1 * x1); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = *x; | |||||
| y0 = *y; | |||||
| dot += (y0 * x0); | |||||
| } | |||||
| } | |||||
| return (dot); | |||||
| } | |||||
| @@ -0,0 +1,118 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||||
| FLOAT * __restrict dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||||
| psrc0 = src; | |||||
| pdst = dst; | |||||
| for (j = (n >> 2); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||||
| } | |||||
| for (i = (m & 3); i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| *pdst++ = *psrc2++; | |||||
| *pdst++ = *psrc3++; | |||||
| *pdst++ = *psrc4++; | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_DP(src3, src1, dst1, dst5); | |||||
| ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); | |||||
| } | |||||
| for (i = (m & 3); i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| *pdst++ = *psrc2++; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_DP2(psrc1, 2, src0, src1); | |||||
| psrc1 += 4; | |||||
| ST_DP2(src0, src1, pdst, 2); | |||||
| pdst += 4; | |||||
| } | |||||
| for (i = (m & 3); i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,186 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||||
| FLOAT * __restrict dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||||
| FLOAT *psrc8, *pdst; | |||||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||||
| psrc0 = src; | |||||
| pdst = dst; | |||||
| for (j = (n >> 3); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc5 = psrc4 + lda; | |||||
| psrc6 = psrc5 + lda; | |||||
| psrc7 = psrc6 + lda; | |||||
| psrc8 = psrc7 + lda; | |||||
| psrc0 += 8 * lda; | |||||
| for (i = (m >> 3); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||||
| } | |||||
| for (i = (m & 7); i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| *pdst++ = *psrc2++; | |||||
| *pdst++ = *psrc3++; | |||||
| *pdst++ = *psrc4++; | |||||
| *pdst++ = *psrc5++; | |||||
| *pdst++ = *psrc6++; | |||||
| *pdst++ = *psrc7++; | |||||
| *pdst++ = *psrc8++; | |||||
| } | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| for (i = (m >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||||
| } | |||||
| for (i = (m & 3); i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| *pdst++ = *psrc2++; | |||||
| *pdst++ = *psrc3++; | |||||
| *pdst++ = *psrc4++; | |||||
| } | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| for (i = (m >> 1); i--;) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| ILVRL_D2_DP(src1, src0, dst0, dst1); | |||||
| ST_DP2_INC(dst0, dst1, pdst, 2); | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| *pdst++ = *psrc2++; | |||||
| } | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| for (i = m; i--;) | |||||
| { | |||||
| *pdst++ = *psrc1++; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,153 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||||
| FLOAT * __restrict dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; | |||||
| FLOAT *pdst0, *pdst1, *pdst2, *pdst3; | |||||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| psrc0 = src; | |||||
| pdst0 = dst; | |||||
| pdst2 = dst + m * (n & ~3); | |||||
| pdst3 = dst + m * (n & ~1); | |||||
| for (j = (m >> 2); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 16; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||||
| pdst1 += m * 4; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| src2 = LD_DP(psrc3); | |||||
| src3 = LD_DP(psrc4); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst3++ = *psrc1++; | |||||
| *pdst3++ = *psrc2++; | |||||
| *pdst3++ = *psrc3++; | |||||
| *pdst3++ = *psrc4++; | |||||
| } | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 8; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| ST_DP4(src0, src1, src2, src3, pdst1, 2); | |||||
| pdst1 += m * 4; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| ST_DP2_INC(src0, src1, pdst2, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst3++ = *psrc1++; | |||||
| *pdst3++ = *psrc2++; | |||||
| } | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| pdst1 = pdst0; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| ST_DP2(src0, src1, pdst1, 2); | |||||
| pdst1 += 4 * m; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| psrc1 += 2; | |||||
| ST_DP(src0, pdst2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst3 = *psrc1; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,276 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||||
| FLOAT * __restrict dst) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; | |||||
| FLOAT *psrc5, *psrc6, *psrc7, *psrc8; | |||||
| FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; | |||||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||||
| psrc0 = src; | |||||
| pdst0 = dst; | |||||
| pdst2 = dst + m * (n & ~7); | |||||
| pdst3 = dst + m * (n & ~3); | |||||
| pdst4 = dst + m * (n & ~1); | |||||
| for (j = (m >> 3); j--;) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc5 = psrc4 + lda; | |||||
| psrc6 = psrc5 + lda; | |||||
| psrc7 = psrc6 + lda; | |||||
| psrc8 = psrc7 + lda; | |||||
| psrc0 += 8 * lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 64; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||||
| pdst1 + 16, 2); | |||||
| LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); | |||||
| LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); | |||||
| LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); | |||||
| LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); | |||||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, | |||||
| 2); | |||||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||||
| pdst1 + 48, 2); | |||||
| pdst1 += m * 8; | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||||
| ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, | |||||
| pdst2, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| src2 = LD_DP(psrc3); | |||||
| src3 = LD_DP(psrc4); | |||||
| src4 = LD_DP(psrc5); | |||||
| src5 = LD_DP(psrc6); | |||||
| src6 = LD_DP(psrc7); | |||||
| src7 = LD_DP(psrc8); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| psrc5 += 2; | |||||
| psrc6 += 2; | |||||
| psrc7 += 2; | |||||
| psrc8 += 2; | |||||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst4++ = *psrc1++; | |||||
| *pdst4++ = *psrc2++; | |||||
| *pdst4++ = *psrc3++; | |||||
| *pdst4++ = *psrc4++; | |||||
| *pdst4++ = *psrc5++; | |||||
| *pdst4++ = *psrc6++; | |||||
| *pdst4++ = *psrc7++; | |||||
| *pdst4++ = *psrc8++; | |||||
| } | |||||
| } | |||||
| if (m & 4) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc3 = psrc2 + lda; | |||||
| psrc4 = psrc3 + lda; | |||||
| psrc0 += 4 * lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 32; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||||
| LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); | |||||
| LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); | |||||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||||
| ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, | |||||
| pdst1 + 16, 2); | |||||
| pdst1 += 8 * m; | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||||
| ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| src2 = LD_DP(psrc3); | |||||
| src3 = LD_DP(psrc4); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| psrc3 += 2; | |||||
| psrc4 += 2; | |||||
| ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst4++ = *psrc1++; | |||||
| *pdst4++ = *psrc2++; | |||||
| *pdst4++ = *psrc3++; | |||||
| *pdst4++ = *psrc4++; | |||||
| } | |||||
| } | |||||
| if (m & 2) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc2 = psrc1 + lda; | |||||
| psrc0 += 2 * lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 16; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||||
| LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); | |||||
| ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); | |||||
| pdst1 += 8 * m; | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||||
| ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| src1 = LD_DP(psrc2); | |||||
| psrc1 += 2; | |||||
| psrc2 += 2; | |||||
| ST_DP2_INC(src0, src1, pdst3, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst4++ = *psrc1++; | |||||
| *pdst4++ = *psrc2++; | |||||
| } | |||||
| } | |||||
| if (m & 1) | |||||
| { | |||||
| psrc1 = psrc0; | |||||
| psrc0 += lda; | |||||
| pdst1 = pdst0; | |||||
| pdst0 += 8; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); | |||||
| ST_DP4(src0, src1, src2, src3, pdst1, 2); | |||||
| pdst1 += 8 * m; | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||||
| ST_DP2_INC(src0, src1, pdst2, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| src0 = LD_DP(psrc1); | |||||
| psrc1 += 2; | |||||
| ST_DP(src0, pdst3); | |||||
| pdst3 += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *pdst4++ = *psrc1++; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,577 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/*
 * Accumulation kernels for the DGEMV "N" driver.  DGEMV_N_<R>x<C>() adds the
 * contribution of C lines of A (pa0.. at offset k) to R elements of y held
 * in the vector accumulators y0.. (two elements per vector).  The scaled x
 * values are expected in tp0.., one per line of A; t0.. are scratch.  All of
 * these names must be in scope at the expansion site.
 */

/* y0..y3 += xs * {a0..a3}: one line of A applied to eight elements of y. */
#define DGEMV_N_ACC8(xs, a0, a1, a2, a3) \
    y0 += (xs) * (a0); \
    y1 += (xs) * (a1); \
    y2 += (xs) * (a2); \
    y3 += (xs) * (a3)

/* y0..y1 += xs * {a0, a1}: one line of A applied to four elements of y. */
#define DGEMV_N_ACC4(xs, a0, a1) \
    y0 += (xs) * (a0); \
    y1 += (xs) * (a1)

/* Eight elements of y, eight lines of A. */
#define DGEMV_N_8x8() \
{ \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
    \
    DGEMV_N_ACC8(tp0, t0, t1, t2, t3); \
    DGEMV_N_ACC8(tp1, t4, t5, t6, t7); \
    DGEMV_N_ACC8(tp2, t8, t9, t10, t11); \
    DGEMV_N_ACC8(tp3, t12, t13, t14, t15); \
    DGEMV_N_ACC8(tp4, t16, t17, t18, t19); \
    DGEMV_N_ACC8(tp5, t20, t21, t22, t23); \
    DGEMV_N_ACC8(tp6, t24, t25, t26, t27); \
    DGEMV_N_ACC8(tp7, t28, t29, t30, t31); \
}

/* Four elements of y, eight lines of A. */
#define DGEMV_N_4x8() \
{ \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
    LD_DP2(pa2 + k, 2, t8, t9); \
    LD_DP2(pa3 + k, 2, t12, t13); \
    LD_DP2(pa4 + k, 2, t16, t17); \
    LD_DP2(pa5 + k, 2, t20, t21); \
    LD_DP2(pa6 + k, 2, t24, t25); \
    LD_DP2(pa7 + k, 2, t28, t29); \
    \
    DGEMV_N_ACC4(tp0, t0, t1); \
    DGEMV_N_ACC4(tp1, t4, t5); \
    DGEMV_N_ACC4(tp2, t8, t9); \
    DGEMV_N_ACC4(tp3, t12, t13); \
    DGEMV_N_ACC4(tp4, t16, t17); \
    DGEMV_N_ACC4(tp5, t20, t21); \
    DGEMV_N_ACC4(tp6, t24, t25); \
    DGEMV_N_ACC4(tp7, t28, t29); \
}

/* Eight elements of y, four lines of A. */
#define DGEMV_N_8x4() \
{ \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
    \
    DGEMV_N_ACC8(tp0, t0, t1, t2, t3); \
    DGEMV_N_ACC8(tp1, t4, t5, t6, t7); \
    DGEMV_N_ACC8(tp2, t8, t9, t10, t11); \
    DGEMV_N_ACC8(tp3, t12, t13, t14, t15); \
}

/* Four elements of y, four lines of A. */
#define DGEMV_N_4x4() \
{ \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
    LD_DP2(pa2 + k, 2, t8, t9); \
    LD_DP2(pa3 + k, 2, t12, t13); \
    \
    DGEMV_N_ACC4(tp0, t0, t1); \
    DGEMV_N_ACC4(tp1, t4, t5); \
    DGEMV_N_ACC4(tp2, t8, t9); \
    DGEMV_N_ACC4(tp3, t12, t13); \
}

/* Eight elements of y, two lines of A. */
#define DGEMV_N_8x2() \
{ \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
    \
    DGEMV_N_ACC8(tp0, t0, t1, t2, t3); \
    DGEMV_N_ACC8(tp1, t4, t5, t6, t7); \
}

/* Four elements of y, two lines of A. */
#define DGEMV_N_4x2() \
{ \
    LD_DP2(pa0 + k, 2, t0, t1); \
    LD_DP2(pa1 + k, 2, t4, t5); \
    \
    DGEMV_N_ACC4(tp0, t0, t1); \
    DGEMV_N_ACC4(tp1, t4, t5); \
}
/*
 * Strided ("GP") load of eight x elements: each x[i * inc_x] is multiplied
 * by alpha into the scalars temp0..temp7, which are then expanded into the
 * vectors tp0..tp7 with COPY_DOUBLE_TO_VECTOR (presumably a broadcast to
 * both lanes -- defined outside this file section).
 *
 * Expands to a plain statement sequence; x, inc_x, alpha, temp0..temp7 and
 * tp0..tp7 must be in scope.  The final line deliberately carries no
 * line-continuation backslash so the definition cannot absorb whatever line
 * follows it.
 */
#define DLOAD_X8_SCALE_GP() \
    temp0 = alpha * x[0 * inc_x]; \
    temp1 = alpha * x[1 * inc_x]; \
    temp2 = alpha * x[2 * inc_x]; \
    temp3 = alpha * x[3 * inc_x]; \
    temp4 = alpha * x[4 * inc_x]; \
    temp5 = alpha * x[5 * inc_x]; \
    temp6 = alpha * x[6 * inc_x]; \
    temp7 = alpha * x[7 * inc_x]; \
    \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
    tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
    tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
    tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
    tp7 = COPY_DOUBLE_TO_VECTOR(temp7);

/*
 * Four-element counterpart of DLOAD_X8_SCALE_GP: fills temp0..temp3 and
 * tp0..tp3 from x[0 * inc_x] .. x[3 * inc_x], each scaled by alpha.
 */
#define DLOAD_X4_SCALE_GP() \
    temp0 = alpha * x[0 * inc_x]; \
    temp1 = alpha * x[1 * inc_x]; \
    temp2 = alpha * x[2 * inc_x]; \
    temp3 = alpha * x[3 * inc_x]; \
    \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);
/*
 * Contiguous ("VECTOR") load of eight x elements: four vectors are loaded
 * from x, multiplied by v_alpha, and each is split with SPLATI_D2_DP into
 * two of tp0..tp7 (presumably one broadcast vector per lane -- the macro is
 * defined outside this file section).
 *
 * Unlike the _GP variant this does not set temp0..temp7.  x, v_alpha,
 * x0..x3 and tp0..tp7 must be in scope.  The final line carries no
 * line-continuation backslash so the definition ends here regardless of
 * what follows.
 */
#define DLOAD_X8_SCALE_VECTOR() \
    LD_DP4(x, 2, x0, x1, x2, x3); \
    \
    x0 = x0 * v_alpha; \
    x1 = x1 * v_alpha; \
    x2 = x2 * v_alpha; \
    x3 = x3 * v_alpha; \
    \
    SPLATI_D2_DP(x0, tp0, tp1); \
    SPLATI_D2_DP(x1, tp2, tp3); \
    SPLATI_D2_DP(x2, tp4, tp5); \
    SPLATI_D2_DP(x3, tp6, tp7);

/*
 * Four-element counterpart of DLOAD_X8_SCALE_VECTOR: loads x0 and x1,
 * scales them by v_alpha and fills tp0..tp3.
 */
#define DLOAD_X4_SCALE_VECTOR() \
    LD_DP2(x, 2, x0, x1); \
    \
    x0 = x0 * v_alpha; \
    x1 = x1 * v_alpha; \
    \
    SPLATI_D2_DP(x0, tp0, tp1); \
    SPLATI_D2_DP(x1, tp2, tp3);
/*
 * Strided ("GP") gather of eight y elements into y0..y3, two per vector.
 * Each element is read as a 64-bit integer pattern through a long long
 * pointer and placed with __msa_insert_d.  tp0 only serves as the starting
 * vector for the lane-0 insert; lane 1 is then inserted into the result, so
 * both lanes of every y vector end up overwritten.
 *
 * y, inc_y, tp0 and y0..y3 must be in scope.  The final line carries no
 * line-continuation backslash so the definition cannot absorb the line that
 * follows it.
 */
#define DLOAD_Y8_GP() \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
    y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \
    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
    y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y)));

/* Four-element counterpart of DLOAD_Y8_GP: gathers into y0 and y1. */
#define DLOAD_Y4_GP() \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y)));

/* Contiguous ("VECTOR") loads of eight / four y elements into y0..y3 / y0..y1. */
#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3);
#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1);
/*
 * Strided ("GP") scatter of y0..y3 back to eight y elements.  Each lane is
 * extracted as a 64-bit integer with __msa_copy_s_d and written through a
 * long long pointer, mirroring the access pattern of DLOAD_Y8_GP.
 *
 * y, inc_y and y0..y3 must be in scope.  The final line carries no
 * line-continuation backslash so the definition cannot absorb the line that
 * follows it.
 */
#define DSTORE_Y8_GP() \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
    *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
    *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
    *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
    *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1);

/* Four-element counterpart of DSTORE_Y8_GP: scatters y0 and y1. */
#define DSTORE_Y4_GP() \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);

/* Contiguous ("VECTOR") stores of y0..y3 / y0..y1 to eight / four y elements. */
#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2);
#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2);
/*
 * Driver for y += alpha * A * x.  Lines of A (pa0.., lda elements apart) are
 * consumed eight at a time, then four, two and one; for every such group the
 * whole of y is updated in chunks of eight, four and finally single
 * elements.
 *
 * Before expanding this macro the caller must map DLOAD_X8_SCALE,
 * DLOAD_X4_SCALE, DLOAD_Y8, DLOAD_Y4, DSTORE_Y8 and DSTORE_Y4 onto the _GP
 * or _VECTOR variants (those mappings are not defined in this part of the
 * file).  It also needs i, j, k, m, n, alpha, x, inc_x, y, y_org, inc_y,
 * lda, pa0..pa7, temp, temp0..temp7 and the vector temporaries used by the
 * helper macros in scope.  x and pa0.. are advanced as groups are consumed;
 * y is rewound to y_org for every group.
 *
 * The scalar tails of the 8- and 4-line groups recompute temp0.. because the
 * _VECTOR scale loaders do not set them.  The 2-line group sets temp0/temp1
 * itself, so its tail reuses them.  The last line has no continuation
 * backslash so the macro cannot swallow the code that follows it.
 */
#define DGEMV_N_MSA() \
    for (j = (n >> 3); j--;) \
    { \
        DLOAD_X8_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x8(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x8(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        if (m & 3) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            temp2 = alpha * x[2 * inc_x]; \
            temp3 = alpha * x[3 * inc_x]; \
            temp4 = alpha * x[4 * inc_x]; \
            temp5 = alpha * x[5 * inc_x]; \
            temp6 = alpha * x[6 * inc_x]; \
            temp7 = alpha * x[7 * inc_x]; \
            \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                temp += temp2 * pa2[k]; \
                temp += temp3 * pa3[k]; \
                temp += temp4 * pa4[k]; \
                temp += temp5 * pa5[k]; \
                temp += temp6 * pa6[k]; \
                temp += temp7 * pa7[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
        } \
        \
        pa0 += 8 * lda; \
        pa1 += 8 * lda; \
        pa2 += 8 * lda; \
        pa3 += 8 * lda; \
        pa4 += 8 * lda; \
        pa5 += 8 * lda; \
        pa6 += 8 * lda; \
        pa7 += 8 * lda; \
        \
        x += 8 * inc_x; \
    } \
    \
    if (n & 4) \
    { \
        DLOAD_X4_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x4(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x4(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        if (m & 3) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            temp2 = alpha * x[2 * inc_x]; \
            temp3 = alpha * x[3 * inc_x]; \
            \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                temp += temp2 * pa2[k]; \
                temp += temp3 * pa3[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
        } \
        \
        pa0 += 4 * lda; \
        pa1 += 4 * lda; \
        pa2 += 4 * lda; \
        pa3 += 4 * lda; \
        \
        x += 4 * inc_x; \
    } \
    \
    if (n & 2) \
    { \
        temp0 = alpha * x[0 * inc_x]; \
        temp1 = alpha * x[1 * inc_x]; \
        \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x2(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x2(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        /* temp0 and temp1 are still valid from the top of this section. */ \
        for (i = (m & 3); i--;) \
        { \
            temp = y[0]; \
            temp += temp0 * pa0[k]; \
            temp += temp1 * pa1[k]; \
            y[0] = temp; \
            \
            y += inc_y; \
            k++; \
        } \
        \
        pa0 += 2 * lda; \
        pa1 += 2 * lda; \
        \
        x += 2 * inc_x; \
    } \
    \
    if (n & 1) \
    { \
        temp = alpha * x[0]; \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = m; i--;) \
        { \
            y[0] += temp * pa0[k]; \
            y += inc_y; \
            k++; \
        } \
    }
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i, j, k; | |||||
| FLOAT *y_org = y; | |||||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||||
| FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||||
| v2f64 v_alpha; | |||||
| v2f64 x0, x1, x2, x3, y0, y1, y2, y3; | |||||
| v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||||
| v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; | |||||
| v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; | |||||
| v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); | |||||
| pa0 = A; | |||||
| pa1 = A + lda; | |||||
| pa2 = A + 2 * lda; | |||||
| pa3 = A + 3 * lda; | |||||
| pa4 = A + 4 * lda; | |||||
| pa5 = A + 5 * lda; | |||||
| pa6 = A + 6 * lda; | |||||
| pa7 = A + 7 * lda; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR | |||||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR | |||||
| #define DLOAD_Y8 DLOAD_Y8_VECTOR | |||||
| #define DLOAD_Y4 DLOAD_Y4_VECTOR | |||||
| #define DSTORE_Y8 DSTORE_Y8_VECTOR | |||||
| #define DSTORE_Y4 DSTORE_Y4_VECTOR | |||||
| DGEMV_N_MSA(); | |||||
| #undef DLOAD_X8_SCALE | |||||
| #undef DLOAD_X4_SCALE | |||||
| #undef DLOAD_Y8 | |||||
| #undef DLOAD_Y4 | |||||
| #undef DSTORE_Y8 | |||||
| #undef DSTORE_Y4 | |||||
| } | |||||
| else if (1 == inc_y) | |||||
| { | |||||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP | |||||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP | |||||
| #define DLOAD_Y8 DLOAD_Y8_VECTOR | |||||
| #define DLOAD_Y4 DLOAD_Y4_VECTOR | |||||
| #define DSTORE_Y8 DSTORE_Y8_VECTOR | |||||
| #define DSTORE_Y4 DSTORE_Y4_VECTOR | |||||
| DGEMV_N_MSA(); | |||||
| #undef DLOAD_X8_SCALE | |||||
| #undef DLOAD_X4_SCALE | |||||
| #undef DLOAD_Y8 | |||||
| #undef DLOAD_Y4 | |||||
| #undef DSTORE_Y8 | |||||
| #undef DSTORE_Y4 | |||||
| } | |||||
| else if (1 == inc_x) | |||||
| { | |||||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR | |||||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR | |||||
| #define DLOAD_Y8 DLOAD_Y8_GP | |||||
| #define DLOAD_Y4 DLOAD_Y4_GP | |||||
| #define DSTORE_Y8 DSTORE_Y8_GP | |||||
| #define DSTORE_Y4 DSTORE_Y4_GP | |||||
| DGEMV_N_MSA(); | |||||
| #undef DLOAD_X8_SCALE | |||||
| #undef DLOAD_X4_SCALE | |||||
| #undef DLOAD_Y8 | |||||
| #undef DLOAD_Y4 | |||||
| #undef DSTORE_Y8 | |||||
| #undef DSTORE_Y4 | |||||
| } | |||||
| else | |||||
| { | |||||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP | |||||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP | |||||
| #define DLOAD_Y8 DLOAD_Y8_GP | |||||
| #define DLOAD_Y4 DLOAD_Y4_GP | |||||
| #define DSTORE_Y8 DSTORE_Y8_GP | |||||
| #define DSTORE_Y4 DSTORE_Y4_GP | |||||
| DGEMV_N_MSA(); | |||||
| #undef DLOAD_X8_SCALE | |||||
| #undef DLOAD_X4_SCALE | |||||
| #undef DLOAD_Y8 | |||||
| #undef DLOAD_Y4 | |||||
| #undef DSTORE_Y8 | |||||
| #undef DSTORE_Y4 | |||||
| } | |||||
| return(0); | |||||
| } | |||||
| @@ -0,0 +1,589 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/*
 * DGEMV_T_8x8: one step over eight A pointers (pa0..pa7) and the four x
 * vectors x0..x3.  For each pointer, four vectors are fetched at offset k via
 * LD_DP4 (presumably eight consecutive doubles -- confirm in macros_msa.h)
 * and multiplied lane-wise into that pointer's accumulator tp0..tp7.
 *
 * Each accumulator expression is evaluated left to right, so products are
 * added in the order x0, x1, x2, x3.
 */
#define DGEMV_T_8x8()                                         \
{                                                             \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);                       \
    tp0 = tp0 + x0 * t0 + x1 * t1 + x2 * t2 + x3 * t3;        \
                                                              \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);                       \
    tp1 = tp1 + x0 * t4 + x1 * t5 + x2 * t6 + x3 * t7;        \
                                                              \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);                     \
    tp2 = tp2 + x0 * t8 + x1 * t9 + x2 * t10 + x3 * t11;      \
                                                              \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);                   \
    tp3 = tp3 + x0 * t12 + x1 * t13 + x2 * t14 + x3 * t15;    \
                                                              \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);                   \
    tp4 = tp4 + x0 * t16 + x1 * t17 + x2 * t18 + x3 * t19;    \
                                                              \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);                   \
    tp5 = tp5 + x0 * t20 + x1 * t21 + x2 * t22 + x3 * t23;    \
                                                              \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);                   \
    tp6 = tp6 + x0 * t24 + x1 * t25 + x2 * t26 + x3 * t27;    \
                                                              \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);                   \
    tp7 = tp7 + x0 * t28 + x1 * t29 + x2 * t30 + x3 * t31;    \
}
/*
 * DGEMV_T_8x4: one step over eight A pointers (pa0..pa7) and the two x
 * vectors x0, x1.  Two vectors per pointer are fetched at offset k via LD_DP2
 * and multiplied lane-wise into tp0..tp7.
 *
 * Within each accumulator the x0 product is added before the x1 product.
 */
#define DGEMV_T_8x4()                          \
{                                              \
    LD_DP2(pa0 + k, 2, t0, t1);                \
    tp0 = tp0 + x0 * t0 + x1 * t1;             \
                                               \
    LD_DP2(pa1 + k, 2, t4, t5);                \
    tp1 = tp1 + x0 * t4 + x1 * t5;             \
                                               \
    LD_DP2(pa2 + k, 2, t8, t9);                \
    tp2 = tp2 + x0 * t8 + x1 * t9;             \
                                               \
    LD_DP2(pa3 + k, 2, t12, t13);              \
    tp3 = tp3 + x0 * t12 + x1 * t13;           \
                                               \
    LD_DP2(pa4 + k, 2, t16, t17);              \
    tp4 = tp4 + x0 * t16 + x1 * t17;           \
                                               \
    LD_DP2(pa5 + k, 2, t20, t21);              \
    tp5 = tp5 + x0 * t20 + x1 * t21;           \
                                               \
    LD_DP2(pa6 + k, 2, t24, t25);              \
    tp6 = tp6 + x0 * t24 + x1 * t25;           \
                                               \
    LD_DP2(pa7 + k, 2, t28, t29);              \
    tp7 = tp7 + x0 * t28 + x1 * t29;           \
}
/*
 * DGEMV_T_8x2: one step over eight A pointers (pa0..pa7) and the single x
 * vector x0.  One vector per pointer is fetched at offset k via LD_DP and its
 * lane-wise product with x0 is added to the matching accumulator tp0..tp7.
 */
#define DGEMV_T_8x2()                \
{                                    \
    t0 = LD_DP(pa0 + k);             \
    tp0 += x0 * t0;                  \
                                     \
    t4 = LD_DP(pa1 + k);             \
    tp1 += x0 * t4;                  \
                                     \
    t8 = LD_DP(pa2 + k);             \
    tp2 += x0 * t8;                  \
                                     \
    t12 = LD_DP(pa3 + k);            \
    tp3 += x0 * t12;                 \
                                     \
    t16 = LD_DP(pa4 + k);            \
    tp4 += x0 * t16;                 \
                                     \
    t20 = LD_DP(pa5 + k);            \
    tp5 += x0 * t20;                 \
                                     \
    t24 = LD_DP(pa6 + k);            \
    tp6 += x0 * t24;                 \
                                     \
    t28 = LD_DP(pa7 + k);            \
    tp7 += x0 * t28;                 \
}
/*
 * DGEMV_T_4x8: one step over four A pointers (pa0..pa3) and the four x
 * vectors x0..x3.  Four vectors per pointer are fetched at offset k via
 * LD_DP4 and multiplied lane-wise into tp0..tp3.
 *
 * Each accumulator expression is evaluated left to right, so products are
 * added in the order x0, x1, x2, x3.
 */
#define DGEMV_T_4x8()                                         \
{                                                             \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);                       \
    tp0 = tp0 + x0 * t0 + x1 * t1 + x2 * t2 + x3 * t3;        \
                                                              \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);                       \
    tp1 = tp1 + x0 * t4 + x1 * t5 + x2 * t6 + x3 * t7;        \
                                                              \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);                     \
    tp2 = tp2 + x0 * t8 + x1 * t9 + x2 * t10 + x3 * t11;      \
                                                              \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);                   \
    tp3 = tp3 + x0 * t12 + x1 * t13 + x2 * t14 + x3 * t15;    \
}
/*
 * DGEMV_T_4x4: one step over four A pointers (pa0..pa3) and the two x
 * vectors x0, x1.  Two vectors per pointer are fetched at offset k via LD_DP2
 * and multiplied lane-wise into tp0..tp3.
 *
 * Within each accumulator the x0 product is added before the x1 product.
 */
#define DGEMV_T_4x4()                          \
{                                              \
    LD_DP2(pa0 + k, 2, t0, t1);                \
    tp0 = tp0 + x0 * t0 + x1 * t1;             \
                                               \
    LD_DP2(pa1 + k, 2, t4, t5);                \
    tp1 = tp1 + x0 * t4 + x1 * t5;             \
                                               \
    LD_DP2(pa2 + k, 2, t8, t9);                \
    tp2 = tp2 + x0 * t8 + x1 * t9;             \
                                               \
    LD_DP2(pa3 + k, 2, t12, t13);              \
    tp3 = tp3 + x0 * t12 + x1 * t13;           \
}
/*
 * DGEMV_T_4x2: one step over four A pointers (pa0..pa3) and the single x
 * vector x0.  One vector per pointer is fetched at offset k via LD_DP and its
 * lane-wise product with x0 is added to the matching accumulator tp0..tp3.
 */
#define DGEMV_T_4x2()                \
{                                    \
    t0 = LD_DP(pa0 + k);             \
    tp0 += x0 * t0;                  \
                                     \
    t4 = LD_DP(pa1 + k);             \
    tp1 += x0 * t4;                  \
                                     \
    t8 = LD_DP(pa2 + k);             \
    tp2 += x0 * t8;                  \
                                     \
    t12 = LD_DP(pa3 + k);            \
    tp3 += x0 * t12;                 \
}
/*
 * DGEMV_T_2x8: one step over two A pointers (pa0, pa1) and the four x
 * vectors x0..x3.  Four vectors per pointer are fetched at offset k via
 * LD_DP4 and multiplied lane-wise into tp0 and tp1.
 *
 * Each accumulator expression is evaluated left to right, so products are
 * added in the order x0, x1, x2, x3.
 */
#define DGEMV_T_2x8()                                         \
{                                                             \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);                       \
    tp0 = tp0 + x0 * t0 + x1 * t1 + x2 * t2 + x3 * t3;        \
                                                              \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);                       \
    tp1 = tp1 + x0 * t4 + x1 * t5 + x2 * t6 + x3 * t7;        \
}
/*
 * DGEMV_T_2x4: one step over two A pointers (pa0, pa1) and the two x vectors
 * x0, x1.  Two vectors per pointer are fetched at offset k via LD_DP2 and
 * multiplied lane-wise into tp0 and tp1.
 *
 * Within each accumulator the x0 product is added before the x1 product.
 */
#define DGEMV_T_2x4()                          \
{                                              \
    LD_DP2(pa0 + k, 2, t0, t1);                \
    tp0 = tp0 + x0 * t0 + x1 * t1;             \
                                               \
    LD_DP2(pa1 + k, 2, t4, t5);                \
    tp1 = tp1 + x0 * t4 + x1 * t5;             \
}
/*
 * DGEMV_T_2x2: one step over two A pointers (pa0, pa1) and the single x
 * vector x0.  One vector per pointer is fetched at offset k via LD_DP and its
 * lane-wise product with x0 is added to tp0 and tp1 respectively.
 */
#define DGEMV_T_2x2()                \
{                                    \
    t0 = LD_DP(pa0 + k);             \
    tp0 += x0 * t0;                  \
                                     \
    t4 = LD_DP(pa1 + k);             \
    tp1 += x0 * t4;                  \
}
/*
 * DLOAD_X8_GP: gather eight x elements spaced inc_x apart into x0..x3, two
 * per vector (x0 <- x[0], x[1*inc_x]; ...; x3 <- x[6*inc_x], x[7*inc_x]).
 *
 * Each element is read as a raw 64-bit pattern through a long long pointer
 * and placed with __msa_insert_d.  tp0 only serves as the starting vector for
 * the lane-0 insert; lane 1 is then inserted into the result, so both lanes
 * of every xN are overwritten (assuming __msa_insert_d replaces just the
 * indexed lane -- confirm against the MSA intrinsic reference).
 *
 * NOTE(review): reading a double through `long long *` is a type pun that
 * may conflict with strict-aliasing rules -- confirm the build flags.
 *
 * The final line deliberately has no trailing backslash, so the definition
 * ends here and cannot absorb whatever line follows it.
 */
#define DLOAD_X8_GP()                                                              \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));  \
    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x)));  \
    x2 = (v2f64) __msa_insert_d((v2i64) x2,  1, *((long long *)(x + 5 * inc_x)));  \
    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x)));  \
    x3 = (v2f64) __msa_insert_d((v2i64) x3,  1, *((long long *)(x + 7 * inc_x)));
/*
 * DLOAD_X4_GP: gather four x elements spaced inc_x apart into x0 and x1, two
 * per vector (x0 <- x[0], x[1*inc_x]; x1 <- x[2*inc_x], x[3*inc_x]).
 *
 * Each element is read as a raw 64-bit pattern through a long long pointer
 * and placed with __msa_insert_d.  tp0 only serves as the starting vector for
 * the lane-0 insert; lane 1 is then inserted into the result.
 *
 * NOTE(review): reading a double through `long long *` is a type pun that
 * may conflict with strict-aliasing rules -- confirm the build flags.
 *
 * The final line deliberately has no trailing backslash, so the definition
 * ends here and cannot absorb whatever line follows it.
 */
#define DLOAD_X4_GP()                                                              \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));
/*
 * DLOAD_X2_GP: gather two x elements spaced inc_x apart into x0
 * (lane 0 <- x[0], lane 1 <- x[1*inc_x]).
 *
 * Each element is read as a raw 64-bit pattern through a long long pointer
 * and placed with __msa_insert_d.  tp0 only serves as the starting vector for
 * the lane-0 insert; lane 1 is then inserted into the result.
 *
 * NOTE(review): reading a double through `long long *` is a type pun that
 * may conflict with strict-aliasing rules -- confirm the build flags.
 *
 * The final line deliberately has no trailing backslash, so the definition
 * ends here and cannot absorb whatever line follows it.
 */
#define DLOAD_X2_GP()                                                              \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));
/*
 * Unit-stride x loaders: fetch 8, 4 or 2 x elements straight from x into
 * x0..x3, x0..x1 or x0 using the LD_DP* helpers from macros_msa.h.
 *
 * The definitions carry no trailing semicolon; every call site writes
 * `DLOAD_Xn();` and supplies it, which avoids a stray empty statement.
 */
#define DLOAD_X8_VECTOR()  LD_DP4(x, 2, x0, x1, x2, x3)
#define DLOAD_X4_VECTOR()  LD_DP2(x, 2, x0, x1)
#define DLOAD_X2_VECTOR()  x0 = LD_DP(x)
/*
 * DGEMV_T_MSA: body of the transposed GEMV kernel.
 *
 * For each of n outputs the macro accumulates sum_k pa[k] * x[k * inc_x] over
 * k = 0..m-1, where pa starts at A and advances by lda per output, and then
 * performs y[0] += alpha * sum before stepping y by inc_y.  Outputs are
 * processed in groups of 8, then 4, 2 and 1 (n >> 3, n & 4, n & 2, n & 1).
 * Within a group, x is consumed in chunks of 8, 4 and 2 through the vector
 * accumulators tp0..tp7, with a scalar step for a final odd element (m & 1).
 *
 * Requirements at the expansion site:
 *   - in scope: i, j, k, m, n, lda, alpha, x, srcx_org, inc_x, y, inc_y,
 *     pa0..pa7, temp0..temp7, res0..res7, x0..x3, t0..t31, tp0..tp7, zero;
 *   - DLOAD_X8 / DLOAD_X4 / DLOAD_X2 bound to either the *_VECTOR or *_GP
 *     loaders.
 *
 * The body is wrapped in do { ... } while (0) so the expansion is a single
 * statement; invoke it as `DGEMV_T_MSA();`.
 */
#define DGEMV_T_MSA()                                \
do                                                   \
{                                                    \
    /* Groups of eight outputs (pa0..pa7). */        \
    for (j = (n >> 3); j--;)                         \
    {                                                \
        tp0 = zero;                                  \
        tp1 = zero;                                  \
        tp2 = zero;                                  \
        tp3 = zero;                                  \
        tp4 = zero;                                  \
        tp5 = zero;                                  \
        tp6 = zero;                                  \
        tp7 = zero;                                  \
                                                     \
        k = 0;                                       \
        x = srcx_org;                                \
                                                     \
        for (i = (m >> 3); i--;)                     \
        {                                            \
            DLOAD_X8();                              \
            DGEMV_T_8x8();                           \
                                                     \
            x += 8 * inc_x;                          \
            k += 8;                                  \
        }                                            \
                                                     \
        if (m & 4)                                   \
        {                                            \
            DLOAD_X4();                              \
            DGEMV_T_8x4();                           \
                                                     \
            x += 4 * inc_x;                          \
            k += 4;                                  \
        }                                            \
                                                     \
        if (m & 2)                                   \
        {                                            \
            DLOAD_X2();                              \
            DGEMV_T_8x2();                           \
                                                     \
            x += 2 * inc_x;                          \
            k += 2;                                  \
        }                                            \
                                                     \
        /* Combine the lanes of each accumulator  */ \
        /* (ILVRL_D2_DP / ADD2 are macros_msa.h   */ \
        /* helpers; presumably interleave + add   */ \
        /* -- confirm there).                     */ \
        ILVRL_D2_DP(tp1, tp0, t0, t4);               \
        ILVRL_D2_DP(tp3, tp2, t1, t5);               \
        ILVRL_D2_DP(tp5, tp4, t2, t6);               \
        ILVRL_D2_DP(tp7, tp6, t3, t7);               \
        ADD2(t0, t4, t1, t5, t0, t1);                \
        ADD2(t2, t6, t3, t7, t2, t3);                \
                                                     \
        temp0 = t0[0];                               \
        temp1 = t0[1];                               \
        temp2 = t1[0];                               \
        temp3 = t1[1];                               \
        temp4 = t2[0];                               \
        temp5 = t2[1];                               \
        temp6 = t3[0];                               \
        temp7 = t3[1];                               \
                                                     \
        /* Scalar step for an odd final element. */  \
        if (m & 1)                                   \
        {                                            \
            temp0 += pa0[k] * x[0];                  \
            temp1 += pa1[k] * x[0];                  \
            temp2 += pa2[k] * x[0];                  \
            temp3 += pa3[k] * x[0];                  \
            temp4 += pa4[k] * x[0];                  \
            temp5 += pa5[k] * x[0];                  \
            temp6 += pa6[k] * x[0];                  \
            temp7 += pa7[k] * x[0];                  \
                                                     \
            x += inc_x;                              \
            k++;                                     \
        }                                            \
                                                     \
        /* All y reads happen before any y write; */ \
        /* keep the three phases in this order.   */ \
        res0 = y[0 * inc_y];                         \
        res1 = y[1 * inc_y];                         \
        res2 = y[2 * inc_y];                         \
        res3 = y[3 * inc_y];                         \
        res4 = y[4 * inc_y];                         \
        res5 = y[5 * inc_y];                         \
        res6 = y[6 * inc_y];                         \
        res7 = y[7 * inc_y];                         \
                                                     \
        res0 += alpha * temp0;                       \
        res1 += alpha * temp1;                       \
        res2 += alpha * temp2;                       \
        res3 += alpha * temp3;                       \
        res4 += alpha * temp4;                       \
        res5 += alpha * temp5;                       \
        res6 += alpha * temp6;                       \
        res7 += alpha * temp7;                       \
                                                     \
        y[0 * inc_y] = res0;                         \
        y[1 * inc_y] = res1;                         \
        y[2 * inc_y] = res2;                         \
        y[3 * inc_y] = res3;                         \
        y[4 * inc_y] = res4;                         \
        y[5 * inc_y] = res5;                         \
        y[6 * inc_y] = res6;                         \
        y[7 * inc_y] = res7;                         \
                                                     \
        y += 8 * inc_y;                              \
                                                     \
        pa0 += 8 * lda;                              \
        pa1 += 8 * lda;                              \
        pa2 += 8 * lda;                              \
        pa3 += 8 * lda;                              \
        pa4 += 8 * lda;                              \
        pa5 += 8 * lda;                              \
        pa6 += 8 * lda;                              \
        pa7 += 8 * lda;                              \
    }                                                \
                                                     \
    /* Group of four outputs (pa0..pa3). */          \
    if (n & 4)                                       \
    {                                                \
        tp0 = zero;                                  \
        tp1 = zero;                                  \
        tp2 = zero;                                  \
        tp3 = zero;                                  \
                                                     \
        k = 0;                                       \
        x = srcx_org;                                \
                                                     \
        for (i = (m >> 3); i--;)                     \
        {                                            \
            DLOAD_X8();                              \
            DGEMV_T_4x8();                           \
                                                     \
            x += 8 * inc_x;                          \
            k += 8;                                  \
        }                                            \
                                                     \
        if (m & 4)                                   \
        {                                            \
            DLOAD_X4();                              \
            DGEMV_T_4x4();                           \
                                                     \
            x += 4 * inc_x;                          \
            k += 4;                                  \
        }                                            \
                                                     \
        if (m & 2)                                   \
        {                                            \
            DLOAD_X2();                              \
            DGEMV_T_4x2();                           \
                                                     \
            x += 2 * inc_x;                          \
            k += 2;                                  \
        }                                            \
                                                     \
        ILVRL_D2_DP(tp1, tp0, t0, t4);               \
        ILVRL_D2_DP(tp3, tp2, t1, t5);               \
        ADD2(t0, t4, t1, t5, t0, t1);                \
                                                     \
        temp0 = t0[0];                               \
        temp1 = t0[1];                               \
        temp2 = t1[0];                               \
        temp3 = t1[1];                               \
                                                     \
        if (m & 1)                                   \
        {                                            \
            temp0 += pa0[k] * x[0];                  \
            temp1 += pa1[k] * x[0];                  \
            temp2 += pa2[k] * x[0];                  \
            temp3 += pa3[k] * x[0];                  \
                                                     \
            x += inc_x;                              \
            k++;                                     \
        }                                            \
                                                     \
        res0 = y[0 * inc_y];                         \
        res1 = y[1 * inc_y];                         \
        res2 = y[2 * inc_y];                         \
        res3 = y[3 * inc_y];                         \
                                                     \
        res0 += alpha * temp0;                       \
        res1 += alpha * temp1;                       \
        res2 += alpha * temp2;                       \
        res3 += alpha * temp3;                       \
                                                     \
        y[0 * inc_y] = res0;                         \
        y[1 * inc_y] = res1;                         \
        y[2 * inc_y] = res2;                         \
        y[3 * inc_y] = res3;                         \
                                                     \
        y += 4 * inc_y;                              \
                                                     \
        pa0 += 4 * lda;                              \
        pa1 += 4 * lda;                              \
        pa2 += 4 * lda;                              \
        pa3 += 4 * lda;                              \
    }                                                \
                                                     \
    /* Group of two outputs (pa0, pa1). */           \
    if (n & 2)                                       \
    {                                                \
        tp0 = zero;                                  \
        tp1 = zero;                                  \
                                                     \
        k = 0;                                       \
        x = srcx_org;                                \
                                                     \
        for (i = (m >> 3); i--;)                     \
        {                                            \
            DLOAD_X8();                              \
            DGEMV_T_2x8();                           \
                                                     \
            x += 8 * inc_x;                          \
            k += 8;                                  \
        }                                            \
                                                     \
        if (m & 4)                                   \
        {                                            \
            DLOAD_X4();                              \
            DGEMV_T_2x4();                           \
                                                     \
            x += 4 * inc_x;                          \
            k += 4;                                  \
        }                                            \
                                                     \
        if (m & 2)                                   \
        {                                            \
            DLOAD_X2();                              \
            DGEMV_T_2x2();                           \
                                                     \
            x += 2 * inc_x;                          \
            k += 2;                                  \
        }                                            \
                                                     \
        ILVRL_D2_DP(tp1, tp0, t0, t4);               \
                                                     \
        t0 += t4;                                    \
                                                     \
        temp0 = t0[0];                               \
        temp1 = t0[1];                               \
                                                     \
        if (m & 1)                                   \
        {                                            \
            temp0 += pa0[k] * x[0];                  \
            temp1 += pa1[k] * x[0];                  \
            x += inc_x;                              \
            k++;                                     \
        }                                            \
                                                     \
        res0 = y[0 * inc_y];                         \
        res1 = y[1 * inc_y];                         \
                                                     \
        res0 += alpha * temp0;                       \
        res1 += alpha * temp1;                       \
                                                     \
        y[0 * inc_y] = res0;                         \
        y[1 * inc_y] = res1;                         \
                                                     \
        y += 2 * inc_y;                              \
                                                     \
        pa0 += 2 * lda;                              \
        pa1 += 2 * lda;                              \
    }                                                \
                                                     \
    /* Last single output: plain scalar loop. */     \
    if (n & 1)                                       \
    {                                                \
        temp0 = 0.0;                                 \
                                                     \
        k = 0;                                       \
        x = srcx_org;                                \
                                                     \
        for (i = m; i--;)                            \
        {                                            \
            temp0 += pa0[k] * x[0];                  \
            x += inc_x;                              \
            k++;                                     \
        }                                            \
                                                     \
        y[0] += alpha * temp0;                       \
        y += inc_y;                                  \
        pa0 += lda;                                  \
    }                                                \
} while (0)
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||||
| FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i, j, k; | |||||
| FLOAT *srcx_org = x; | |||||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||||
| FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||||
| FLOAT res0, res1, res2, res3, res4, res5, res6, res7; | |||||
| v2f64 x0, x1, x2, x3; | |||||
| v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||||
| v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; | |||||
| v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; | |||||
| v2f64 zero = {0}; | |||||
| pa0 = A + 0 * lda; | |||||
| pa1 = A + 1 * lda; | |||||
| pa2 = A + 2 * lda; | |||||
| pa3 = A + 3 * lda; | |||||
| pa4 = A + 4 * lda; | |||||
| pa5 = A + 5 * lda; | |||||
| pa6 = A + 6 * lda; | |||||
| pa7 = A + 7 * lda; | |||||
| if (1 == inc_x) | |||||
| { | |||||
| #define DLOAD_X8 DLOAD_X8_VECTOR | |||||
| #define DLOAD_X4 DLOAD_X4_VECTOR | |||||
| #define DLOAD_X2 DLOAD_X2_VECTOR | |||||
| DGEMV_T_MSA(); | |||||
| #undef DLOAD_X8 | |||||
| #undef DLOAD_X4 | |||||
| #undef DLOAD_X2 | |||||
| } | |||||
| else | |||||
| { | |||||
| #define DLOAD_X8 DLOAD_X8_GP | |||||
| #define DLOAD_X4 DLOAD_X4_GP | |||||
| #define DLOAD_X2 DLOAD_X2_GP | |||||
| DGEMV_T_MSA(); | |||||
| #undef DLOAD_X8 | |||||
| #undef DLOAD_X4 | |||||
| #undef DLOAD_X2 | |||||
| } | |||||
| return(0); | |||||
| } | |||||